Index: head/sys/dev/cxgbe/iw_cxgbe/provider.c
===================================================================
--- head/sys/dev/cxgbe/iw_cxgbe/provider.c	(revision 300675)
+++ head/sys/dev/cxgbe/iw_cxgbe/provider.c	(revision 300676)
@@ -1,502 +1,504 @@
 /*
  * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#define LINUXKPI_PARAM_PREFIX iw_cxgbe_
+
 #include "opt_inet.h"
 
 #ifdef TCP_OFFLOAD
[... #include list (stripped during extraction) and the remainder of the file -- module_param(fastreg_support), the c4iw_* ib_device methods, c4iw_register_device()/c4iw_unregister_device() -- are unchanged ...]
Index: head/sys/dev/mlx5/mlx5_core/mlx5_main.c
===================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_main.c	(revision 300675)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_main.c	(revision 300676)
@@ -1,1126 +1,1128 @@
 /*-
  * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
+#define LINUXKPI_PARAM_PREFIX mlx5_
+
[... #include list (stripped during extraction) and the remainder of the file -- MODULE_* declarations, module_param_named(debug_mask/prof_sel), the mlx5 profiles table, set_dma_caps()/request_bar()/MSI-X setup, HCA capability handling, EQ allocation, mlx5_dev_init()/mlx5_dev_cleanup(), interface registration, init_one()/remove_one(), the PCI device table, and module init/exit -- are unchanged ...]
mlx5_dev_event event, void *data); }; static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mlx5_core_dev *dev; struct mlx5_priv *priv; int err; dev = kzalloc(sizeof(*dev), GFP_KERNEL); priv = &dev->priv; if (prof_sel < 0 || prof_sel >= ARRAY_SIZE(profiles)) { printf("mlx5_core: WARN: ""selected profile out of range, selecting default (%d)\n", MLX5_DEFAULT_PROF); prof_sel = MLX5_DEFAULT_PROF; } dev->profile = &profiles[prof_sel]; dev->event = mlx5_core_event; INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); err = mlx5_dev_init(dev, pdev); if (err) { device_printf((&pdev->dev)->bsddev, "ERR: ""mlx5_dev_init failed %d\n", err); goto out; } err = mlx5_register_device(dev); if (err) { device_printf((&pdev->dev)->bsddev, "ERR: ""mlx5_register_device failed %d\n", err); goto out_init; } return 0; out_init: mlx5_dev_cleanup(dev); out: kfree(dev); return err; } static void remove_one(struct pci_dev *pdev) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); mlx5_unregister_device(dev); mlx5_dev_cleanup(dev); kfree(dev); } static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 4113) }, /* Connect-IB */ { PCI_VDEVICE(MELLANOX, 4114) }, /* Connect-IB VF */ { PCI_VDEVICE(MELLANOX, 4115) }, /* ConnectX-4 */ { PCI_VDEVICE(MELLANOX, 4116) }, /* ConnectX-4 VF */ { PCI_VDEVICE(MELLANOX, 4117) }, /* ConnectX-4LX */ { PCI_VDEVICE(MELLANOX, 4118) }, /* ConnectX-4LX VF */ { PCI_VDEVICE(MELLANOX, 4119) }, { PCI_VDEVICE(MELLANOX, 4120) }, { PCI_VDEVICE(MELLANOX, 4121) }, { PCI_VDEVICE(MELLANOX, 4122) }, { PCI_VDEVICE(MELLANOX, 4123) }, { PCI_VDEVICE(MELLANOX, 4124) }, { PCI_VDEVICE(MELLANOX, 4125) }, { PCI_VDEVICE(MELLANOX, 4126) }, { PCI_VDEVICE(MELLANOX, 4127) }, { PCI_VDEVICE(MELLANOX, 4128) }, { PCI_VDEVICE(MELLANOX, 4129) }, { PCI_VDEVICE(MELLANOX, 4130) }, { PCI_VDEVICE(MELLANOX, 4131) }, { PCI_VDEVICE(MELLANOX, 4132) }, { PCI_VDEVICE(MELLANOX, 4133) }, { PCI_VDEVICE(MELLANOX, 4134) }, { PCI_VDEVICE(MELLANOX, 4135) }, { PCI_VDEVICE(MELLANOX, 4136) }, { PCI_VDEVICE(MELLANOX, 4137) }, { PCI_VDEVICE(MELLANOX, 4138) }, { PCI_VDEVICE(MELLANOX, 4139) }, { PCI_VDEVICE(MELLANOX, 4140) }, { PCI_VDEVICE(MELLANOX, 4141) }, { PCI_VDEVICE(MELLANOX, 4142) }, { PCI_VDEVICE(MELLANOX, 4143) }, { PCI_VDEVICE(MELLANOX, 4144) }, { 0, } }; MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); static struct pci_driver mlx5_core_driver = { .name = DRIVER_NAME, .id_table = mlx5_core_pci_table, .probe = init_one, .remove = remove_one }; static int __init init(void) { int err; mlx5_core_wq = create_singlethread_workqueue("mlx5_core_wq"); if (!mlx5_core_wq) { err = -ENOMEM; goto err_debug; } mlx5_health_init(); err = pci_register_driver(&mlx5_core_driver); if (err) goto err_health; return 0; err_health: mlx5_health_cleanup(); destroy_workqueue(mlx5_core_wq); err_debug: return err; } static void __exit cleanup(void) { pci_unregister_driver(&mlx5_core_driver); mlx5_health_cleanup(); destroy_workqueue(mlx5_core_wq); } module_init(init); module_exit(cleanup); Index: head/sys/ofed/drivers/infiniband/core/cma.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/cma.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/core/cma.c (revision 300676) @@ -1,3863 +1,3865 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. 
* Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#define LINUXKPI_PARAM_PREFIX ibcore_ + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT; module_param_named(cma_response_timeout, cma_response_timeout, int, 0644); MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT (default=20)"); static int def_prec2sl = 3; module_param_named(def_prec2sl, def_prec2sl, int, 0644); MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7"); static int unify_tcp_port_space = 1; module_param(unify_tcp_port_space, int, 0644); MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " "space allocation (default=1)"); static int debug_level = 0; #define cma_pr(level, priv, format, arg...) \ printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg) #define cma_dbg(priv, format, arg...) \ do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0) #define cma_warn(priv, format, arg...) 
\ cma_pr(KERN_WARNING, priv, format, ## arg) #define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x" #define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\ ((u8 *)(gid))[13],\ ((u8 *)(gid))[14],\ ((u8 *)(gid))[15] #define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw) #define cma_debug_path(priv, pfx, p) \ cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" \ CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), \ CMA_GID_ARG(p.dgid)) #define cma_debug_gid(priv, g) \ cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g) module_param_named(debug_level, debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "debug level default=0"); static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; static struct workqueue_struct *cma_free_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); static DEFINE_IDR(ipoib_ps); static DEFINE_IDR(ib_ps); struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; atomic_t refcount; struct list_head id_list; }; struct rdma_bind_list { struct idr *ps; struct hlist_head owners; unsigned short port; }; enum { CMA_OPTION_AFONLY, }; /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ struct cma_device *cma_dev; struct list_head mc_list; int internal_id; enum rdma_cm_state state; spinlock_t lock; spinlock_t cm_lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; struct work_struct work; /* garbage coll */ int backlog; int timeout_ms; struct ib_sa_query *query; int query_id; union { struct ib_cm_id *ib; struct iw_cm_id *iw; } cm_id; u32 seq_num; u32 qkey; u32 qp_num; pid_t owner; u32 options; u8 srq; u8 tos; u8 reuseaddr; u8 afonly; int qp_timeout; /* cache for mc record params */ struct ib_sa_mcmember_rec rec; int is_valid_rec; }; struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *ib; } multicast; struct list_head list; void *context; struct sockaddr_storage addr; struct kref mcref; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; struct cma_ndev_work { struct work_struct work; struct rdma_id_private *id; struct rdma_cm_event event; }; struct iboe_mcast_work { struct work_struct work; struct rdma_id_private *id; struct cma_multicast *mc; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; struct sdp_hh { u8 bsdh[16]; u8 sdp_version; /* Major version: 7:4 */ u8 ip_version; /* IP version: 7:4 */ u8 sdp_specific1[10]; __be16 port; __be16 sdp_specific2; union 
cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; struct sdp_hah { u8 bsdh[16]; u8 sdp_version; }; #define CMA_VERSION 0x00 #define SDP_MAJ_VERSION 0x2 static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); ret = (id_priv->state == comp); spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, enum rdma_cm_state exch) { unsigned long flags; enum rdma_cm_state old; spin_lock_irqsave(&id_priv->lock, flags); old = id_priv->state; id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return old; } static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static inline u8 sdp_get_majv(u8 sdp_version) { return sdp_version >> 4; } static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) { return hh->ip_version >> 4; } static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) { hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { atomic_inc(&cma_dev->refcount); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } static inline void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } static inline void release_mc(struct kref *kref) { struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); kfree(mc->multicast.ib); kfree(mc); } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del(&id_priv->list); cma_deref_dev(id_priv->cma_dev); id_priv->cma_dev = NULL; mutex_unlock(&lock); } static int cma_set_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; if (id_priv->qkey) return 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) { int i; int err; struct ib_port_attr props; union ib_gid tmp; err = ib_query_port(device, port_num, &props); if (err) return 1; for (i = 0; i < props.gid_tbl_len; ++i) { err = ib_query_gid(device, port_num, i, &tmp); if (err) return 1; if (!memcmp(&tmp, gid, sizeof tmp)) return 0; } return -EAGAIN; } int rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type, void **cm_id) { int ret; u8 port; int found_dev = 0, found_cmid = 0; struct rdma_id_private *id_priv; struct rdma_id_private *dev_id_priv; struct cma_device *cma_dev; struct rdma_dev_addr dev_addr; union ib_gid gid; enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ? 
IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; memset(&dev_addr, 0, sizeof(dev_addr)); ret = rdma_translate_ip((struct sockaddr *)local_addr, &dev_addr, NULL); if (ret) goto err; /* find rdma device based on MAC address/gid */ mutex_lock(&lock); memcpy(&gid, dev_addr.src_dev_addr + rdma_addr_gid_offset(&dev_addr), sizeof(gid)); list_for_each_entry(cma_dev, &dev_list, list) for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) if ((rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) && (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IWARP)) { ret = find_gid_port(cma_dev->device, &gid, port); if (!ret) { found_dev = 1; goto out; } else if (ret == 1) { mutex_unlock(&lock); goto err; } } out: mutex_unlock(&lock); if (!found_dev) goto err; /* Traverse through the list of listening cm_id's to find the * desired cm_id based on rdma device & port number. */ list_for_each_entry(id_priv, &listen_any_list, list) list_for_each_entry(dev_id_priv, &id_priv->listen_list, listen_list) if (dev_id_priv->cma_dev == cma_dev) if (dev_id_priv->cm_id.iw->local_addr.sin_port == local_addr->sin_port) { *cm_id = (void *)dev_id_priv->cm_id.iw; found_cmid = 1; } return found_cmid ? 0 : -ENODEV; err: return -ENODEV; } EXPORT_SYMBOL(rdma_find_cmid_laddr); static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; union ib_gid gid, iboe_gid; int ret = -ENODEV; u8 port; enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; if (dev_ll != IB_LINK_LAYER_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; mutex_lock(&lock); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) ret = find_gid_port(cma_dev->device, &iboe_gid, port); else ret = find_gid_port(cma_dev->device, &gid, port); if (!ret) { id_priv->id.port_num = port; goto out; } else if (ret == 1) break; } } } out: if (!ret) cma_attach_to_dev(id_priv, cma_dev); mutex_unlock(&lock); return ret; } static void cma_deref_id(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static int cma_disable_callback(struct rdma_id_private *id_priv, enum rdma_cm_state state) { mutex_lock(&id_priv->handler_mutex); if (id_priv->state != state) { mutex_unlock(&id_priv->handler_mutex); return -EINVAL; } return 0; } struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->owner = curthread->td_proc->p_pid; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); spin_lock_init(&id_priv->cm_lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); 
INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); return &id_priv->id; } EXPORT_SYMBOL(rdma_create_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) return -EINVAL; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto err; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); return 0; err: ib_destroy_qp(qp); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. 
*/ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, qp_attr.ah_attr.grh.sgid_index, &sgid); if (ret) goto out; if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == IB_LINK_LAYER_ETHERNET) { u32 scope_id = rdma_get_ipv6_scope_id(id_priv->id.device, id_priv->id.port_num); ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL, scope_id); if (ret) goto out; } if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; if (id_priv->qp_timeout && id_priv->id.qp->qp_type == IB_QPT_RC) { qp_attr.timeout = id_priv->qp_timeout; qp_attr_mask |= IB_QP_TIMEOUT; } ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == IB_LINK_LAYER_INFINIBAND) pkey = ib_addr_get_pkey(dev_addr); else pkey = 0xffff; ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, 
qp_attr_mask); break; default: ret = -ENOSYS; break; } return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline int cma_zero_addr(struct sockaddr *addr) { struct in6_addr *ip6; if (addr->sa_family == AF_INET) return ipv4_is_zeronet( ((struct sockaddr_in *)addr)->sin_addr.s_addr); else { ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; } } static inline int cma_loopback_addr(struct sockaddr *addr) { if (addr->sa_family == AF_INET) return ipv4_is_loopback( ((struct sockaddr_in *) addr)->sin_addr.s_addr); else return ipv6_addr_loopback( &((struct sockaddr_in6 *) addr)->sin6_addr); } static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } int rdma_cma_any_addr(struct sockaddr *addr) { return cma_any_addr(addr); } EXPORT_SYMBOL(rdma_cma_any_addr); static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *) src)->sin_addr.s_addr != ((struct sockaddr_in *) dst)->sin_addr.s_addr; default: return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); } } static inline __be16 cma_port(struct sockaddr *addr) { if (addr->sa_family == AF_INET) return ((struct sockaddr_in *) addr)->sin_port; else return ((struct sockaddr_in6 *) addr)->sin6_port; } static inline int cma_any_port(struct sockaddr *addr) { return !cma_port(addr); } static int cma_get_net_info(void *hdr, enum rdma_port_space ps, u8 *ip_ver, __be16 *port, union cma_ip_addr **src, union cma_ip_addr **dst) { switch (ps) { case RDMA_PS_SDP: if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; *ip_ver = sdp_get_ip_ver(hdr); *port = ((struct sdp_hh *) hdr)->port; *src = &((struct sdp_hh *) hdr)->src_addr; *dst = &((struct sdp_hh *) hdr)->dst_addr; break; default: if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) return -EINVAL; *ip_ver = cma_get_ip_ver(hdr); *port = ((struct cma_hdr *) hdr)->port; *src = &((struct cma_hdr *) hdr)->src_addr; *dst = &((struct cma_hdr *) hdr)->dst_addr; break; } if (*ip_ver != 4 && *ip_ver != 6) return -EINVAL; return 0; } static void cma_save_net_info(struct rdma_addr *addr, struct rdma_addr *listen_addr, u8 ip_ver, __be16 port, union cma_ip_addr *src, union cma_ip_addr *dst) { struct sockaddr_in *listen4, *ip4; struct sockaddr_in6 *listen6, *ip6; switch (ip_ver) { case 4: listen4 = (struct sockaddr_in *) &listen_addr->src_addr; ip4 = (struct sockaddr_in *) &addr->src_addr; ip4->sin_family = listen4->sin_family; ip4->sin_addr.s_addr = dst->ip4.addr; ip4->sin_port = listen4->sin_port; ip4->sin_len = sizeof(struct sockaddr_in); ip4 = (struct sockaddr_in *) &addr->dst_addr; ip4->sin_family = listen4->sin_family; ip4->sin_addr.s_addr = src->ip4.addr; ip4->sin_port = port; ip4->sin_len = sizeof(struct sockaddr_in); break; case 6: listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; ip6 = (struct sockaddr_in6 *) &addr->src_addr; ip6->sin6_family = listen6->sin6_family; ip6->sin6_addr = dst->ip6; ip6->sin6_port = listen6->sin6_port; ip6->sin6_len = sizeof(struct sockaddr_in6); ip6->sin6_scope_id = listen6->sin6_scope_id; ip6 = (struct sockaddr_in6 *) &addr->dst_addr; ip6->sin6_family = listen6->sin6_family; ip6->sin6_addr = src->ip6; ip6->sin6_port = port; ip6->sin6_len = sizeof(struct sockaddr_in6); ip6->sin6_scope_id = listen6->sin6_scope_id; 
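		/*
		 * Note the deliberate swap above: src/dst in the wire header
		 * are from the sender's perspective, so the peer's source
		 * becomes our destination address and vice versa.  The
		 * sin6_scope_id is inherited from the listener, since a
		 * link-local peer is only reachable through the interface
		 * the listen was bound to.
		 */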
break; default: break; } } static inline int cma_user_data_offset(enum rdma_port_space ps) { switch (ps) { case RDMA_PS_SDP: return 0; default: return sizeof(struct cma_hdr); } } static void cma_cancel_route(struct rdma_id_private *id_priv) { switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); break; default: break; } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_entry(id_priv->listen_list.next, struct rdma_id_private, listen_list); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->list); list_del(&dev_id_priv->listen_list); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; mutex_lock(&lock); bind_list = id_priv->bind_list; if (!bind_list) { mutex_unlock(&lock); return; } hlist_del(&id_priv->node); id_priv->bind_list = NULL; if (hlist_empty(&bind_list->owners)) { idr_remove(bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); if (id_priv->sock) sock_release(id_priv->sock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); break; case IB_LINK_LAYER_ETHERNET: kref_put(&mc->mcref, release_mc); break; default: break; } } } static void __rdma_free(struct work_struct *work) { struct rdma_id_private *id_priv; id_priv = container_of(work, struct rdma_id_private, work); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_deref_id(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; enum rdma_cm_state state; unsigned long flags; struct ib_cm_id *ib; id_priv = container_of(id, struct rdma_id_private, id); state = cma_exch(id_priv, RDMA_CM_DESTROYING); cma_cancel_operation(id_priv, state); /* * Wait for any active callback to finish. New callbacks will find * the id_priv state set to destroying and abort. 
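	 * Taking and immediately dropping handler_mutex below is a simple
	 * barrier: it cannot return until any handler that entered before
	 * the state change has released the mutex.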
*/ mutex_lock(&id_priv->handler_mutex); mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: spin_lock_irqsave(&id_priv->cm_lock, flags); if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) { ib = id_priv->cm_id.ib; id_priv->cm_id.ib = NULL; spin_unlock_irqrestore(&id_priv->cm_lock, flags); ib_destroy_cm_id(ib); } else spin_unlock_irqrestore(&id_priv->cm_lock, flags); break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); break; default: break; } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_deref_id(id_priv); INIT_WORK(&id_priv->work, __rdma_free); queue_work(cma_free_wq, &id_priv->work); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; cma_dbg(id_priv, "sending RTU\n"); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); cma_dbg(id_priv, "sending REJ\n"); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) { if (id_priv->id.ps == RDMA_PS_SDP && sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; return 0; } static void cma_set_rep_event_data(struct rdma_cm_event *event, struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; } static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; int ret = 0; if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: event.status = cma_verify_rep(id_priv, ib_event->private_data); if (event.status) event.event = RDMA_CM_EVENT_CONNECT_ERROR; else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { event.status = cma_rep_recv(id_priv); event.event = event.status ? 
RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; /* fall through */ case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; union cma_ip_addr *src, *dst; __be16 port; u8 ip_ver; int ret; if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) return NULL; id = rdma_create_id(listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) return NULL; cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 
2 : 1; rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else { ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, &rt->addr.dev_addr, NULL); if (ret) goto err; } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; union cma_ip_addr *src, *dst; __be16 port; u8 ip_ver; int ret; id = rdma_create_id(listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return NULL; if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) goto err; cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, &id->route.addr.dev_addr, NULL); if (ret) goto err; } id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; } static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int offset, ret; u8 smac[ETH_ALEN]; u8 alt_smac[ETH_ALEN]; u8 *psmac = smac; u8 *palt_smac = alt_smac; int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) == RDMA_TRANSPORT_IB) && (rdma_port_get_link_layer(cm_id->device, ib_event->param.req_rcvd.port) == IB_LINK_LAYER_ETHERNET)); int is_sidr = 0; listen_id = cm_id->context; if (!cma_check_req_qp_type(&listen_id->id, ib_event)) return -EINVAL; if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { is_sidr = 1; conn_id = cma_new_udp_id(&listen_id->id, ib_event); 
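		/*
		 * SIDR (service ID resolution) requests resolve a UD QPN
		 * rather than establishing a connection, so the event is
		 * filled from the ud union member instead of conn.
		 */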
event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_new_conn_id(&listen_id->id, ib_event); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err1; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_acquire_dev(conn_id); if (ret) goto err2; conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. */ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; if (is_iboe && !is_sidr) { u32 scope_id = rdma_get_ipv6_scope_id(cm_id->device, ib_event->param.req_rcvd.port); if (ib_event->param.req_rcvd.primary_path != NULL) rdma_addr_find_smac_by_sgid( &ib_event->param.req_rcvd.primary_path->sgid, psmac, NULL, scope_id); else psmac = NULL; if (ib_event->param.req_rcvd.alternate_path != NULL) rdma_addr_find_smac_by_sgid( &ib_event->param.req_rcvd.alternate_path->sgid, palt_smac, NULL, scope_id); else palt_smac = NULL; } /* * Acquire mutex to prevent user executing rdma_destroy_id() * while we're accessing the cm_id. */ mutex_lock(&lock); if (is_iboe && !is_sidr) ib_update_cm_av(cm_id, psmac, palt_smac); if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) { cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n"); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); cma_deref_id(conn_id); return 0; err3: cma_deref_id(conn_id); /* Destroy the CM ID by returning a non-zero value. 
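	 * Clearing cm_id.ib below also keeps rdma_destroy_id() from
	 * calling ib_destroy_cm_id() on an id the IB CM layer is about
	 * to free itself.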
*/ conn_id->cm_id.ib = NULL; err2: cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); err1: mutex_unlock(&listen_id->handler_mutex); if (conn_id) rdma_destroy_id(&conn_id->id); return ret; } static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) { return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); } static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, struct ib_cm_compare_data *compare) { struct cma_hdr *cma_data, *cma_mask; struct sdp_hh *sdp_data, *sdp_mask; __be32 ip4_addr; struct in6_addr ip6_addr; memset(compare, 0, sizeof *compare); cma_data = (void *) compare->data; cma_mask = (void *) compare->mask; sdp_data = (void *) compare->data; sdp_mask = (void *) compare->mask; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 4); sdp_set_ip_ver(sdp_mask, 0xF); if (!cma_any_addr(addr)) { sdp_data->dst_addr.ip4.addr = ip4_addr; sdp_mask->dst_addr.ip4.addr = htonl(~0); } } else { cma_set_ip_ver(cma_data, 4); cma_set_ip_ver(cma_mask, 0xF); if (!cma_any_addr(addr)) { cma_data->dst_addr.ip4.addr = ip4_addr; cma_mask->dst_addr.ip4.addr = htonl(~0); } } break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 6); sdp_set_ip_ver(sdp_mask, 0xF); if (!cma_any_addr(addr)) { sdp_data->dst_addr.ip6 = ip6_addr; memset(&sdp_mask->dst_addr.ip6, 0xFF, sizeof(sdp_mask->dst_addr.ip6)); } } else { cma_set_ip_ver(cma_data, 6); cma_set_ip_ver(cma_mask, 0xF); if (!cma_any_addr(addr)) { cma_data->dst_addr.ip6 = ip6_addr; memset(&cma_mask->dst_addr.ip6, 0xFF, sizeof(cma_mask->dst_addr.ip6)); } } break; default: break; } } static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; struct sockaddr_in *sin; int ret = 0; if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; *sin = iw_event->local_addr; sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; *sin = iw_event->remote_addr; switch ((int)iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: BUG_ON(1); } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
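	 * The non-zero return tells the iw_cm layer to destroy cm_id, so
	 * drop our reference to it first.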
*/ id_priv->cm_id.iw = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct sockaddr_in *sin; struct net_device *dev = NULL; struct rdma_cm_event event; int ret; struct ib_device_attr attr; listen_id = cm_id->context; if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret = -EADDRNOTAVAIL; mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = cma_acquire_dev(conn_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; *sin = iw_event->local_addr; sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; *sin = iw_event->remote_addr; ret = ib_query_device(conn_id->id.device, &attr); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. 
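	 * The reference taken here is dropped again once the upcall
	 * returns and we are finished with conn_id.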
*/ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); rdma_destroy_id(&conn_id->id); goto out; } mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); out: if (dev) dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct ib_cm_compare_data compare_data; struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; int ret; id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; svc_id = cma_get_service_id(id_priv->id.ps, addr); if (cma_any_addr(addr) && !id_priv->afonly) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); else { cma_set_compare_data(id_priv->id.ps, addr, &compare_data); ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); } if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } return ret; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct sockaddr_in *sin; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, id_priv->sock, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.iw = id; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; id_priv->cm_id.iw->local_addr = *sin; ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; return id_priv->id.event_handler(id, event); } static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { struct rdma_id_private *dev_id_priv; struct rdma_cm_id *id; int ret; id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) return; dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = RDMA_CM_ADDR_BOUND; dev_id_priv->sock = id_priv->sock; memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; ret = rdma_listen(id, id_priv->backlog); if (ret) cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; mutex_lock(&lock); list_add_tail(&id_priv->list, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); id_priv->tos = (u8) tos; } EXPORT_SYMBOL(rdma_set_service_type); void rdma_set_timeout(struct rdma_cm_id *id, int timeout) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); id_priv->qp_timeout = (u8) timeout; } 
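/*
 * The timeout recorded by rdma_set_timeout() is applied later, in
 * cma_modify_qp_rts(), as the QP's local ACK timeout (IB_QP_TIMEOUT)
 * for RC QPs; a value of zero leaves the CM-derived default in place.
 */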
EXPORT_SYMBOL(rdma_set_timeout); static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { struct cma_work *work = context; struct rdma_route *route; route = &work->id->id.route; if (!status) { route->num_paths = 1; *route->path_rec = *path_rec; } else { work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; } queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { struct rdma_addr *addr = &id_priv->id.route.addr; struct ib_sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; memset(&path_rec, 0, sizeof path_rec); rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &addr->dst_addr); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; if (addr->src_addr.ss_family == AF_INET) { path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; } else { sin6 = (struct sockaddr_in6 *) &addr->src_addr; path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } static void cma_ndev_work_handler(struct work_struct *_work) { struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (id_priv->state == RDMA_CM_DESTROYING || id_priv->state == RDMA_CM_DEVICE_REMOVAL) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); 
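	/*
	 * Error unwind: err2 releases the path record allocated above and
	 * err1 frees the work item; the NULL store below keeps
	 * rdma_destroy_id() from freeing path_rec a second time.
	 */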
route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_set_ib_paths(struct rdma_cm_id *id, struct ib_sa_path_rec *path_rec, int num_paths) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } id->route.num_paths = num_paths; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_paths); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; queue_work(cma_wq, &work->work); return 0; } static u8 tos_to_sl(u8 tos) { return def_prec2sl & 7; } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; if (src_addr->sin_family != dst_addr->sin_family) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_paths = 1; if (addr->dev_addr.bound_dev_if) ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; goto err2; } route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = tos_to_sl(id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; work->event.status = 0; queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: switch (rdma_port_get_link_layer(id->device, 
id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ret = cma_resolve_ib_route(id_priv, timeout_ms); break; case IB_LINK_LAYER_ETHERNET: ret = cma_resolve_iboe_route(id_priv); break; default: ret = -ENOSYS; } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_resolve_iw_route(id_priv, timeout_ms); break; default: ret = -ENOSYS; break; } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type) { /* APM is not supported yet */ return -EINVAL; } EXPORT_SYMBOL(rdma_enable_apm); static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; struct ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; mutex_lock(&lock); if (list_empty(&dev_list)) { ret = -ENODEV; goto out; } list_for_each_entry(cma_dev, &dev_list, list) for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) if (!ib_query_port(cma_dev->device, p, &port_attr) && port_attr.state == IB_PORT_ACTIVE) goto port_found; p = 1; cma_dev = list_entry(dev_list.next, struct cma_device, list); port_found: ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event; memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; memcpy(&id_priv->id.route.addr.src_addr, src_addr, ip_addr_size(src_addr)); if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv); if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; struct sockaddr *src, *dst; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; if (cma_zero_addr(src)) { dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; if ((src->sa_family = dst->sa_family) == AF_INET) { ((struct sockaddr_in *)src)->sin_addr = ((struct sockaddr_in *)dst)->sin_addr; } else { ((struct sockaddr_in6 *)src)->sin6_addr = ((struct sockaddr_in6 *)dst)->sin6_addr; } } work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state 
= RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; err: kfree(work); return ret; } static int cma_resolve_scif(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; /* we probably can leave it empty here */ work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; src_addr->sa_family = dst_addr->sa_family; #ifdef INET6 if (dst_addr->sa_family == AF_INET6) { ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; } #endif } if (!cma_any_addr(src_addr)) return rdma_bind_addr(id, src_addr); else { #if defined(INET6) || defined(INET) union { #ifdef INET struct sockaddr_in in; #endif #ifdef INET6 struct sockaddr_in6 in6; #endif } addr; #endif switch(dst_addr->sa_family) { #ifdef INET case AF_INET: memset(&addr.in, 0, sizeof(addr.in)); addr.in.sin_family = dst_addr->sa_family; addr.in.sin_len = sizeof(addr.in); return rdma_bind_addr(id, (struct sockaddr *)&addr.in); #endif #ifdef INET6 case AF_INET6: memset(&addr.in6, 0, sizeof(addr.in6)); addr.in6.sin6_family = dst_addr->sa_family; addr.in6.sin6_len = sizeof(addr.in6); addr.in6.sin6_scope_id = ((struct sockaddr_in6 *)dst_addr)->sin6_scope_id; return rdma_bind_addr(id, (struct sockaddr *)&addr.in6); #endif default: return -EINVAL; } } } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); if (ret) return ret; } if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); if (cma_any_addr(dst_addr)) ret = cma_resolve_loopback(id_priv); else if (id_priv->id.device && rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF) ret = cma_resolve_scif(id_priv); else ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == 
RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr_in *sin; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; sin->sin_port = htons(bind_list->port); id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int port, ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; do { ret = idr_get_new_above(ps, bind_list, snum, &port); } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); if (ret) goto err1; if (port != snum) { ret = -EADDRNOTAVAIL; goto err2; } bind_list->ps = ps; bind_list->port = (unsigned short) port; cma_bind_port(bind_list, id_priv); return 0; err2: idr_remove(ps, port); err1: kfree(bind_list); return ret; } static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; rover = random() % remaining + low; retry: if (last_used_port != rover && !idr_find(ps, (unsigned short) rover)) { int ret = cma_alloc_port(ps, id_priv, rover); /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. 
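 * Sharing is allowed when the existing id is not actively listening
 * and both ids requested address reuse, or when both ids are
 * family-restricted (afonly) and bound to different address families;
 * a wildcard address on either side otherwise conflicts.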
*/ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && cur_id->reuseaddr) continue; cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); bind_list = idr_find(ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static int cma_bind_listen(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; int ret = 0; mutex_lock(&lock); if (bind_list->owners.first->next) ret = cma_check_port(bind_list, id_priv, 0); mutex_unlock(&lock); return ret; } static int cma_get_tcp_port(struct rdma_id_private *id_priv) { int ret; int size; struct socket *sock; ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; #ifdef __linux__ ret = sock->ops->bind(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); #else ret = -sobind(sock, (struct sockaddr *)&id_priv->id.route.addr.src_addr, curthread); #endif if (ret) { sock_release(sock); return ret; } size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); ret = sock_getname(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, &size, 0); if (ret) { sock_release(sock); return ret; } id_priv->sock = sock; return 0; } static int cma_get_port(struct rdma_id_private *id_priv) { struct idr *ps; int ret; switch (id_priv->id.ps) { case RDMA_PS_SDP: ps = &sdp_ps; break; case RDMA_PS_TCP: ps = &tcp_ps; if (unify_tcp_port_space) { ret = cma_get_tcp_port(id_priv); if (ret) goto out; } break; case RDMA_PS_UDP: ps = &udp_ps; break; case RDMA_PS_IPOIB: ps = &ipoib_ps; break; case RDMA_PS_IB: ps = &ib_ps; break; default: return -EPROTONOSUPPORT; } mutex_lock(&lock); if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); out: return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if defined(INET6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && !sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); if (ret) return ret; } if (!cma_comp_exch(id_priv, 
RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) return -EINVAL; if (id_priv->reuseaddr) { ret = cma_bind_listen(id_priv); if (ret) goto err; } id_priv->backlog = backlog; if (id->device) { switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: ret = cma_ib_listen(id_priv); if (ret) goto err; break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; break; default: ret = -ENOSYS; goto err; } } else cma_listen_on_all(id_priv); return 0; err: id_priv->backlog = 0; cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; int ret; #if defined(INET6) int ipv6only; size_t var_size = sizeof(int); #endif if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); if (ret) goto err1; memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); if (!cma_any_addr(addr)) { ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL); if (ret) goto err1; ret = cma_acquire_dev(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if defined(INET6) else if (addr->sa_family == AF_INET6) id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only", &ipv6only, &var_size, NULL, 0, NULL, 0); #endif } ret = cma_get_port(id_priv); if (ret) goto err2; return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, enum rdma_port_space ps, struct rdma_route *route) { struct cma_hdr *cma_hdr; struct sdp_hh *sdp_hdr; if (route->addr.src_addr.ss_family == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) &route->addr.src_addr; dst4 = (struct sockaddr_in *) &route->addr.dst_addr; switch (ps) { case RDMA_PS_SDP: sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; sdp_set_ip_ver(sdp_hdr, 4); sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; sdp_hdr->port = src4->sin_port; break; default: cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; break; } } else { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) &route->addr.src_addr; dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr; switch (ps) { case RDMA_PS_SDP: sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; sdp_set_ip_ver(sdp_hdr, 6); sdp_hdr->src_addr.ip6 = src6->sin6_addr; sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; sdp_hdr->port = src6->sin6_port; break; default: cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; break; } } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; struct 
ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; break; } ret = cma_set_qkey(id_priv); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = -EINVAL; break; } if (id_priv->qkey != rep->qkey) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -EINVAL; break; } ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct rdma_route *route; struct ib_cm_id *id; int ret; req.private_data_len = sizeof(struct cma_hdr) + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!req.private_data) return -ENOMEM; if (conn_param->private_data && conn_param->private_data_len) memcpy((void *) req.private_data + sizeof(struct cma_hdr), conn_param->private_data, conn_param->private_data_len); route = &id_priv->id.route; ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); if (ret) goto out; id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = route->path_rec; req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.timeout_ms = 1 << (cma_response_timeout - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; cma_dbg(id_priv, "sending SIDR\n"); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(req.private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; int offset, ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv->id.ps); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = 
PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; ret = cma_format_hdr(private_data, id_priv->id.ps, route); if (ret) goto out; req.private_data = private_data; req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = cma_response_timeout; req.local_cm_response_timeout = cma_response_timeout; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; cma_dbg(id_priv, "sending REQ\n"); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; struct sockaddr_in* sin; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); id_priv->cm_id.iw = cm_id; sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; cm_id->local_addr = *sin; sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; cm_id->remote_addr = *sin; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? 
id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_connect_iw(id_priv, conn_param); break; default: ret = -ENOSYS; break; } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; cma_dbg(id_priv, "sending REP\n"); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) { iw_param.qpn = id_priv->qp_num; } else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { ret = cma_set_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; } rep.private_data = private_data; rep.private_data_len = private_data_len; cma_dbg(id_priv, "sending SIDR\n"); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); id_priv->owner = curthread->td_proc->p_pid; if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_accept_iw(id_priv, conn_param); break; default: ret = -ENOSYS; break; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, private_data, private_data_len); else { cma_dbg(id_priv, "sending REJ\n"); ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); break; default: ret = -ENOSYS; break; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; 
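	/*
	 * For IB, disconnecting means forcing the QP to the error state
	 * and then running the DREQ/DREP handshake; if sending the DREQ
	 * fails, the peer most likely initiated the disconnect already,
	 * so a DREP is sent in response instead.  iWARP and SCIF delegate
	 * teardown to iw_cm_disconnect().
	 */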
id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ cma_dbg(id_priv, "sending DREQ\n"); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { cma_dbg(id_priv, "sending DREP\n"); ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); break; default: ret = -EINVAL; break; } out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; struct rdma_dev_addr *dev_addr; int ret; struct net_device *ndev = NULL; u16 vlan; id_priv = mc->id_priv; dev_addr = &id_priv->id.route.addr.dev_addr; if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) return 0; mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, be16_to_cpu(multicast->rec.mlid)); mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (!ndev) { status = -ENODEV; } else { vlan = rdma_vlan_dev_vlan_id(ndev); dev_put(ndev); } if (!status) { event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, &event.param.ud.ah_attr); event.param.ud.ah_attr.vlan_id = vlan; event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); } else { event.event = RDMA_CM_EVENT_MULTICAST_ERROR; /* mark that the cached record is no longer valid */ if (status != -ENETRESET && status != -EAGAIN) { spin_lock(&id_priv->lock); id_priv->is_valid_rec = 0; spin_unlock(&id_priv->lock); } } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; } mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; #if defined(INET6) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; #endif if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); #if defined(INET6) } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. 
*/ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); #endif } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret = 0; ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid); /* cache ipoib bc record */ spin_lock(&id_priv->lock); if (!id_priv->is_valid_rec) ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &id_priv->rec.mgid, &id_priv->rec); if (ret) { id_priv->is_valid_rec = 0; spin_unlock(&id_priv->lock); return ret; } else { rec = id_priv->rec; id_priv->is_valid_rec = 1; } spin_unlock(&id_priv->lock); cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); if (id_priv->id.ps == RDMA_PS_UDP) rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = 1; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_RET(mc->multicast.ib); } static void iboe_mcast_work_handler(struct work_struct *work) { struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); struct cma_multicast *mc = mw->mc; struct ib_sa_multicast *m = mc->multicast.ib; mc->multicast.ib->context = mc; cma_ib_mc_handler(0, m); kref_put(&mc->mcref, release_mc); kfree(mw); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = 0xff; mgid->raw[1] = 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); if (!mc->multicast.ib) { err = -ENOMEM; goto out1; } cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); 
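For reference, cma_iboe_set_mgid() above builds the RoCE multicast GID for an IPv4 group by hand: bytes 0-1 are 0xff, 0x0e, bytes 10-11 are 0xff, 0xff, and the last four bytes carry sin_addr verbatim, i.e. ff0e::ffff:<ipv4>. The standalone sketch below reproduces just that IPv4 branch in userspace C so the layout can be printed and inspected; the struct and helper names are hypothetical.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 16-byte GID, modeled on the raw view of union ib_gid. */
struct gid { uint8_t raw[16]; };

/* Same byte layout as the IPv4 branch of cma_iboe_set_mgid(). */
static void iboe_set_mgid_v4(uint32_t s_addr_be, struct gid *mgid)
{
        memset(mgid->raw, 0, sizeof(mgid->raw));
        mgid->raw[0] = 0xff;
        mgid->raw[1] = 0x0e;
        mgid->raw[10] = 0xff;
        mgid->raw[11] = 0xff;
        memcpy(&mgid->raw[12], &s_addr_be, 4); /* already network order */
}

int main(void)
{
        struct in_addr group;
        struct gid mgid;
        int i;

        if (inet_pton(AF_INET, "239.1.2.3", &group) != 1)
                return 1;
        iboe_set_mgid_v4(group.s_addr, &mgid);
        for (i = 0; i < 16; i++)
                printf("%02x%s", mgid.raw[i], (i & 1) && i < 15 ? ":" : "");
        putchar('\n'); /* ff0e:0000:0000:0000:0000:ffff:ef01:0203 */
        return 0;
}
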
mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; } mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); dev_put(ndev); if (!mc->multicast.ib->rec.mtu) { err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); kref_get(&mc->mcref); queue_work(cma_wq, &work->work); return 0; out2: kfree(mc->multicast.ib); out1: kfree(work); return err; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) return -EINVAL; mc = kmalloc(sizeof *mc, GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, ip_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ret = cma_join_ib_multicast(id_priv, mc); break; case IB_LINK_LAYER_ETHERNET: kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); break; default: ret = -EINVAL; } break; default: ret = -ENOSYS; break; } if (ret) { spin_lock_irq(&id_priv->lock); list_del(&mc->list); spin_unlock_irq(&id_priv->lock); kfree(mc); } return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); if (id->qp) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, be16_to_cpu(mc->multicast.ib->rec.mlid)); if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); break; case IB_LINK_LAYER_ETHERNET: kref_put(&mc->mcref, release_mc); break; default: break; } } return; } } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_ndev_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->if_index) && memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->if_xname, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_ndev_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; atomic_inc(&id_priv->refcount); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, 
unsigned long event, void *ctx) { struct net_device *ndev = (struct net_device *)ctx; struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; /* BONDING related, commented out until the bonding is resolved */ #if 0 if (dev_net(ndev) != &init_net) return NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; #endif if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, list) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; cma_dev->device = device; init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); } static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; enum rdma_cm_state state; int ret = 0; /* Record that we want to remove the device */ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); if (state == RDMA_CM_DESTROYING) return 0; cma_cancel_operation(id_priv, state); mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. */ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) goto out; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; ret = id_priv->id.event_handler(&id_priv->id, &event); out: mutex_unlock(&id_priv->handler_mutex); return ret; } static void cma_process_remove(struct cma_device *cma_dev) { struct rdma_id_private *id_priv; int ret; mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { id_priv = list_entry(cma_dev->id_list.next, struct rdma_id_private, list); list_del(&id_priv->listen_list); list_del_init(&id_priv->list); atomic_inc(&id_priv->refcount); mutex_unlock(&lock); ret = id_priv->internal_id ? 
1 : cma_remove_id_dev(id_priv); cma_deref_id(id_priv); if (ret) rdma_destroy_id(&id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); cma_deref_dev(cma_dev); wait_for_completion(&cma_dev->comp); } static void cma_remove_one(struct ib_device *device) { struct cma_device *cma_dev; cma_dev = ib_get_client_data(device, &cma_client); if (!cma_dev) return; mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev); } static int __init cma_init(void) { int ret = -ENOMEM; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; cma_free_wq = create_singlethread_workqueue("rdma_cm_fr"); if (!cma_free_wq) goto err1; ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); if (ret) goto err; return 0; err: unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_free_wq); err1: destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); flush_workqueue(cma_free_wq); destroy_workqueue(cma_free_wq); destroy_workqueue(cma_wq); idr_destroy(&sdp_ps); idr_destroy(&tcp_ps); idr_destroy(&udp_ps); idr_destroy(&ipoib_ps); idr_destroy(&ib_ps); } module_init(cma_init); module_exit(cma_cleanup); Index: head/sys/ofed/drivers/infiniband/core/mad.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/mad.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/core/mad.c (revision 300676) @@ -1,3691 +1,3694 @@ /* * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 *
 */
+
+#define LINUXKPI_PARAM_PREFIX ibcore_
+
#include
#include
#include
#include
#include
#include "mad_priv.h"
#include "mad_rmpp.h"
#include "smi.h"
#include "agent.h"

MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("kernel IB MAD API");
MODULE_AUTHOR("Hal Rosenstock");
MODULE_AUTHOR("Sean Hefty");

static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;

module_param_named(send_queue_size, mad_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");

static struct kmem_cache *ib_mad_cache;

static struct list_head ib_mad_port_list;
static u32 ib_mad_client_id = 0;

/*
 * Timeout FIFO (tf) param
 */
enum {
        /* min time between 2 consecutive activations of tf workqueue */
        MIN_BETWEEN_ACTIVATIONS_MS = 5
};

/*
 * SA congestion control params
 */
enum {
        MAX_OUTSTANDING_SA_MADS = 10,
        MIN_TIME_FOR_SA_MAD_SEND_MS = 20,
        MAX_SA_MADS = 10000
};

/* Port list lock */
static DEFINE_SPINLOCK(ib_mad_port_list_lock);

/* Forward declarations */
static int method_in_use(struct ib_mad_mgmt_method_table **method,
                         struct ib_mad_reg_req *mad_reg_req);
static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
static struct ib_mad_agent_private *find_mad_agent(
                                        struct ib_mad_port_private *port_priv,
                                        struct ib_mad *mad);
static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
                                    struct ib_mad_private *mad);
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
static void timeout_sends(struct work_struct *work);
static void local_completions(struct work_struct *work);
static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                              struct ib_mad_agent_private *agent_priv,
                              u8 mgmt_class);
static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                           struct ib_mad_agent_private *agent_priv);
static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr,
                          u32 timeout_ms, u32 retries_left);

/*
 * Timeout FIFO functions - implements FIFO with timeout mechanism
 */
static void activate_timeout_handler_task(unsigned long data)
{
        struct to_fifo *tf;

        tf = (struct to_fifo *)data;
        del_timer(&tf->timer);
        queue_work(tf->workq, &tf->work);
}

static unsigned long adjusted_time(unsigned long last, unsigned long next)
{
        unsigned long min_next;

        min_next = last + msecs_to_jiffies(MIN_BETWEEN_ACTIVATIONS_MS);
        if (time_after(min_next, next))
                return min_next;
        return next;
}

static void notify_failure(struct ib_mad_send_wr_private *mad_send_wr,
                           enum ib_wc_status status)
{
        struct ib_mad_send_wc mad_send_wc;
        struct ib_mad_agent_private *mad_agent_priv;

        mad_send_wc.status = status;
        mad_send_wc.vendor_err = 0;
        mad_send_wc.send_buf = &mad_send_wr->send_buf;
        mad_agent_priv = mad_send_wr->mad_agent_priv;
        mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc);
}

static inline struct sa_cc_data *
get_cc_obj(struct ib_mad_send_wr_private *mad_send_wr)
{
        return &mad_send_wr->mad_agent_priv->qp_info->port_priv->sa_cc;
}

static inline struct ib_mad_send_wr_private *tfe_to_mad(struct tf_entry *tfe)
{
        return container_of(tfe, struct ib_mad_send_wr_private, tf_list);
}

static void timeout_handler_task(struct work_struct *work)
{
        struct tf_entry *tmp1, *tmp2;
        struct list_head *list_item, exp_lst;
        unsigned long flags, curr_time;
        int lst_empty;
        struct to_fifo *tf;

        tf = container_of(work, struct to_fifo, work);
        do {
                INIT_LIST_HEAD(&exp_lst);
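Before timeout_handler_task() continues below, it is worth spelling out the invariant it relies on: every pending entry sits on two lists at once, a FIFO in arrival order (tf_entry.fifo_list) and a timeout list kept sorted by expiry (tf_entry.to_list), so dequeue can honor arrival order while the timer always tracks the earliest expiry. A rough userspace model of that dual linkage, using hypothetical names and plain singly linked lists in place of list_head:

#include <stdio.h>

/* One queued item on both lists, like tf_entry. */
struct entry {
        unsigned long exp_time;   /* absolute expiry, like tfe->exp_time */
        struct entry *fifo_next;  /* arrival order */
        struct entry *to_next;    /* ascending expiry order */
};

struct tf {
        struct entry *fifo_head, *fifo_tail;
        struct entry *to_head;
};

/* Like tf_enqueue(): append to the FIFO, sorted-insert into the timeout
 * list (the kernel version finds the insertion point by walking the
 * list backwards with list_for_each_prev()). */
static void enqueue(struct tf *tf, struct entry *e)
{
        struct entry **pp;

        e->fifo_next = NULL;
        if (tf->fifo_tail)
                tf->fifo_tail->fifo_next = e;
        else
                tf->fifo_head = e;
        tf->fifo_tail = e;

        for (pp = &tf->to_head; *pp && (*pp)->exp_time <= e->exp_time;
             pp = &(*pp)->to_next)
                ;
        e->to_next = *pp;
        *pp = e;
}

int main(void)
{
        struct tf tf = { 0 };
        struct entry a = { .exp_time = 300 }, b = { .exp_time = 100 },
                     c = { .exp_time = 200 };
        const struct entry *e;

        enqueue(&tf, &a);
        enqueue(&tf, &b);
        enqueue(&tf, &c);
        for (e = tf.fifo_head; e; e = e->fifo_next)  /* 300 100 200 */
                printf("%lu ", e->exp_time);
        putchar('\n');
        for (e = tf.to_head; e; e = e->to_next)      /* 100 200 300 */
                printf("%lu ", e->exp_time);
        putchar('\n');
        return 0;
}
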
spin_lock_irqsave(&tf->lists_lock, flags); curr_time = jiffies; list_for_each(list_item, &tf->to_head) { tmp1 = list_entry(list_item, struct tf_entry, to_list); if (time_before(curr_time, tmp1->exp_time)) break; list_del(&tmp1->fifo_list); tf->num_items--; } /* cut list up to and including list_item->prev */ list_cut_position(&exp_lst, &tf->to_head, list_item->prev); spin_unlock_irqrestore(&tf->lists_lock, flags); lst_empty = list_empty(&exp_lst); list_for_each_entry_safe(tmp1, tmp2, &exp_lst, to_list) { list_del(&tmp1->to_list); if (tmp1->canceled) { tmp1->canceled = 0; notify_failure(tfe_to_mad(tmp1), IB_WC_WR_FLUSH_ERR); } else { notify_failure(tfe_to_mad(tmp1), IB_WC_RESP_TIMEOUT_ERR); } } } while (!lst_empty); spin_lock_irqsave(&tf->lists_lock, flags); if (!list_empty(&tf->to_head)) { tmp1 = list_entry(tf->to_head.next, struct tf_entry, to_list); mod_timer(&tf->timer, adjusted_time(curr_time, tmp1->exp_time)); } spin_unlock_irqrestore(&tf->lists_lock, flags); } /** * tf_create - creates new timeout-fifo object * @fifo_size: Maximum fifo size * * Allocate and initialize new timeout-fifo object */ static struct to_fifo *tf_create(u32 fifo_size) { struct to_fifo *tf; tf = kzalloc(sizeof(*tf), GFP_KERNEL); if (tf) { tf->workq = create_singlethread_workqueue("to_fifo"); if (!tf->workq) { kfree(tf); return NULL; } spin_lock_init(&tf->lists_lock); INIT_LIST_HEAD(&tf->to_head); INIT_LIST_HEAD(&tf->fifo_head); init_timer(&tf->timer); INIT_WORK(&tf->work, timeout_handler_task); tf->timer.data = (unsigned long) tf; tf->timer.function = activate_timeout_handler_task; tf->timer.expires = jiffies; tf->fifo_size = fifo_size; tf->stop_enqueue = 0; tf->num_items = 0; } return tf; } /** * tf_enqueue - enqueue item to timeout-fifo object * @tf:timeout-fifo object * @item: item to enqueue. * @timeout_ms: item expiration time in ms. * * Enqueue item to fifo and modify expiration timer when required. * * Returns 0 on success and negative on failure. */ static int tf_enqueue(struct to_fifo *tf, struct tf_entry *item, u32 timeout_ms) { struct tf_entry *tmp; struct list_head *list_item; unsigned long flags; item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); spin_lock_irqsave(&tf->lists_lock, flags); if (tf->num_items >= tf->fifo_size || tf->stop_enqueue) { spin_unlock_irqrestore(&tf->lists_lock, flags); return -EBUSY; } /* Insert item to timeout list */ list_for_each_prev(list_item, &tf->to_head) { tmp = list_entry(list_item, struct tf_entry, to_list); if (time_after(item->exp_time, tmp->exp_time)) break; } list_add(&item->to_list, list_item); /* Insert item to fifo list */ list_add_tail(&item->fifo_list, &tf->fifo_head); tf->num_items++; /* modify expiration timer if required */ if (list_item == &tf->to_head) mod_timer(&tf->timer, item->exp_time); spin_unlock_irqrestore(&tf->lists_lock, flags); return 0; } /** * tf_dequeue - dequeue item from timeout-fifo object * @tf:timeout-fifo object * @time_left_ms: returns the time left for expiration in ms. * * Dequeue item from fifo and modify expiration timer when required. * * Returns pointer to tf_entry on success and NULL on failure. 
 */
static struct tf_entry *tf_dequeue(struct to_fifo *tf, u32 *time_left_ms)
{
        unsigned long flags;
        unsigned long time_left;
        struct tf_entry *tmp, *tmp1;
        bool found = false;

        spin_lock_irqsave(&tf->lists_lock, flags);
        if (list_empty(&tf->fifo_head)) {
                spin_unlock_irqrestore(&tf->lists_lock, flags);
                return NULL;
        }

        list_for_each_entry(tmp, &tf->fifo_head, fifo_list) {
                if (!tmp->canceled) {
                        found = true;
                        break;
                }
        }
        if (!found) {
                spin_unlock_irqrestore(&tf->lists_lock, flags);
                return NULL;
        }

        /* modify timer in case enqueued item is the next to expire */
        if (tf->to_head.next == &tmp->to_list) {
                if (list_is_last(&tmp->to_list, &tf->to_head)) {
                        del_timer(&tf->timer);
                } else {
                        tmp1 = list_entry(tmp->to_list.next, struct tf_entry,
                                          to_list);
                        mod_timer(&tf->timer, tmp1->exp_time);
                }
        }

        list_del(&tmp->fifo_list);
        list_del(&tmp->to_list);
        tf->num_items--;
        spin_unlock_irqrestore(&tf->lists_lock, flags);

        time_left = tmp->exp_time - jiffies;
        if ((long) time_left <= 0)
                time_left = 0;
        *time_left_ms = jiffies_to_msecs(time_left);

        return tmp;
}

static void tf_stop_enqueue(struct to_fifo *tf)
{
        unsigned long flags;

        spin_lock_irqsave(&tf->lists_lock, flags);
        tf->stop_enqueue = 1;
        spin_unlock_irqrestore(&tf->lists_lock, flags);
}

/**
 * tf_free - free empty timeout-fifo object
 * @tf: timeout-fifo object
 */
static void tf_free(struct to_fifo *tf)
{
        del_timer_sync(&tf->timer);
        flush_workqueue(tf->workq);
        destroy_workqueue(tf->workq);
        kfree(tf);
}

/**
 * tf_free_agent - free MADs related to a specific MAD agent from the timeout-fifo
 * @tf: timeout-fifo object
 * @mad_agent_priv: MAD agent.
 */
static void tf_free_agent(struct to_fifo *tf,
                          struct ib_mad_agent_private *mad_agent_priv)
{
        unsigned long flags;
        struct tf_entry *tmp, *tmp1;
        struct list_head tmp_head;

        INIT_LIST_HEAD(&tmp_head);
        spin_lock_irqsave(&tf->lists_lock, flags);
        list_for_each_entry_safe(tmp, tmp1, &tf->fifo_head, fifo_list) {
                if (tfe_to_mad(tmp)->mad_agent_priv == mad_agent_priv) {
                        list_del(&tmp->to_list);
                        list_move(&tmp->fifo_list, &tmp_head);
                        tf->num_items--;
                }
        }
        spin_unlock_irqrestore(&tf->lists_lock, flags);

        list_for_each_entry_safe(tmp, tmp1, &tmp_head, fifo_list) {
                list_del(&tmp->fifo_list);
                notify_failure(tfe_to_mad(tmp), IB_WC_WR_FLUSH_ERR);
        }
}

/**
 * tf_modify_item - modify the expiration time of a specific item
 * @tf: timeout-fifo object
 * @mad_agent_priv: MAD agent.
 * @send_buf: the MAD to modify in queue
 * @timeout_ms: new timeout to set.
 *
 * Returns 0 if the item was found on the list and -ENXIO if not.
 *
 * Note: The send_buf may point to a MAD that was already released.
* Therefore we can't use this struct before finding it in the list */ static int tf_modify_item(struct to_fifo *tf, struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_buf *send_buf, u32 timeout_ms) { struct tf_entry *tmp, *item; struct list_head *list_item; unsigned long flags; int found = 0; spin_lock_irqsave(&tf->lists_lock, flags); list_for_each_entry(item, &tf->fifo_head, fifo_list) { if (tfe_to_mad(item)->mad_agent_priv == mad_agent_priv && &tfe_to_mad(item)->send_buf == send_buf) { found = 1; break; } } if (!found) { spin_unlock_irqrestore(&tf->lists_lock, flags); return -ENXIO; } item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); if (timeout_ms) { list_del(&item->to_list); list_for_each_prev(list_item, &tf->to_head) { tmp = list_entry(list_item, struct tf_entry, to_list); if (time_after(item->exp_time, tmp->exp_time)) break; } list_add(&item->to_list, list_item); /* modify expiration timer if required */ if (list_item == &tf->to_head) mod_timer(&tf->timer, item->exp_time); } else { /* * when item canceled (timeout_ms == 0) move item to * head of timeout list and to the tail of fifo list */ item->canceled = 1; list_move(&item->to_list, &tf->to_head); list_move_tail(&item->fifo_list, &tf->fifo_head); mod_timer(&tf->timer, item->exp_time); } spin_unlock_irqrestore(&tf->lists_lock, flags); return 0; } /* * SA congestion control functions */ /* * Defines which MAD is under congestion control. */ static int is_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_hdr *mad; mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad; return ((mad_send_wr->send_buf.timeout_ms) && (mad->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) && ((mad->method == IB_MGMT_METHOD_GET) || (mad->method == IB_MGMT_METHOD_SET))); } /* * Notify that SA congestion controlled MAD is done. * to allow dequeuing SA MAD from congestion control queue. */ static void sa_cc_mad_done(struct sa_cc_data *cc_obj) { unsigned long flags; struct tf_entry *tfe; struct ib_mad_send_wr_private *mad_send_wr; u32 time_left_ms, timeout_ms, retries; int ret; do { spin_lock_irqsave(&cc_obj->lock, flags); tfe = tf_dequeue(cc_obj->tf, &time_left_ms); if (!tfe) { if (cc_obj->outstanding > 0) cc_obj->outstanding--; spin_unlock_irqrestore(&cc_obj->lock, flags); break; } spin_unlock_irqrestore(&cc_obj->lock, flags); mad_send_wr = tfe_to_mad(tfe); time_left_ms += MIN_TIME_FOR_SA_MAD_SEND_MS; if (time_left_ms > mad_send_wr->send_buf.timeout_ms) { retries = time_left_ms / mad_send_wr->send_buf.timeout_ms - 1; timeout_ms = mad_send_wr->send_buf.timeout_ms; } else { retries = 0; timeout_ms = time_left_ms; } ret = send_sa_cc_mad(mad_send_wr, timeout_ms, retries); if (ret) { if (ret == -ENOMEM) notify_failure(mad_send_wr, IB_WC_GENERAL_ERR); else notify_failure(mad_send_wr, IB_WC_LOC_QP_OP_ERR); } } while (ret); } /* * Send SA MAD under congestion control. 
*/ static int sa_cc_mad_send(struct ib_mad_send_wr_private *mad_send_wr) { unsigned long flags; int ret; struct sa_cc_data *cc_obj; cc_obj = get_cc_obj(mad_send_wr); spin_lock_irqsave(&cc_obj->lock, flags); if (cc_obj->outstanding < MAX_OUTSTANDING_SA_MADS) { cc_obj->outstanding++; spin_unlock_irqrestore(&cc_obj->lock, flags); ret = send_sa_cc_mad(mad_send_wr, mad_send_wr->send_buf.timeout_ms, mad_send_wr->retries_left); if (ret) sa_cc_mad_done(cc_obj); } else { int qtime = (mad_send_wr->send_buf.timeout_ms * (mad_send_wr->retries_left + 1)) - MIN_TIME_FOR_SA_MAD_SEND_MS; if (qtime < 0) qtime = 0; ret = tf_enqueue(cc_obj->tf, &mad_send_wr->tf_list, (u32)qtime); spin_unlock_irqrestore(&cc_obj->lock, flags); } return ret; } /* * Initialize SA congestion control. */ static int sa_cc_init(struct sa_cc_data *cc_obj) { spin_lock_init(&cc_obj->lock); cc_obj->outstanding = 0; cc_obj->tf = tf_create(MAX_SA_MADS); if (!cc_obj->tf) return -ENOMEM; return 0; } /* * Cancel SA MADs from congestion control queue. */ static void cancel_sa_cc_mads(struct ib_mad_agent_private *mad_agent_priv) { tf_free_agent(mad_agent_priv->qp_info->port_priv->sa_cc.tf, mad_agent_priv); } /* * Modify timeout of SA MAD on congestion control queue. */ static int modify_sa_cc_mad(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_buf *send_buf, u32 timeout_ms) { int ret; int qtime = 0; if (timeout_ms > MIN_TIME_FOR_SA_MAD_SEND_MS) qtime = timeout_ms - MIN_TIME_FOR_SA_MAD_SEND_MS; ret = tf_modify_item(mad_agent_priv->qp_info->port_priv->sa_cc.tf, mad_agent_priv, send_buf, (u32)qtime); return ret; } static void sa_cc_destroy(struct sa_cc_data *cc_obj) { struct ib_mad_send_wr_private *mad_send_wr; struct tf_entry *tfe; struct ib_mad_send_wc mad_send_wc; struct ib_mad_agent_private *mad_agent_priv; u32 time_left_ms; mad_send_wc.status = IB_WC_WR_FLUSH_ERR; mad_send_wc.vendor_err = 0; tf_stop_enqueue(cc_obj->tf); tfe = tf_dequeue(cc_obj->tf, &time_left_ms); while (tfe) { mad_send_wr = tfe_to_mad(tfe); mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_agent_priv = mad_send_wr->mad_agent_priv; mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); tfe = tf_dequeue(cc_obj->tf, &time_left_ms); } tf_free(cc_obj->tf); } /* * Returns a ib_mad_port_private structure or NULL for a device/port * Assumes ib_mad_port_list_lock is being held */ static inline struct ib_mad_port_private * __ib_get_mad_port(struct ib_device *device, int port_num) { struct ib_mad_port_private *entry; list_for_each_entry(entry, &ib_mad_port_list, port_list) { if (entry->device == device && entry->port_num == port_num) return entry; } return NULL; } /* * Wrapper function to return a ib_mad_port_private structure or NULL * for a device/port */ static inline struct ib_mad_port_private * ib_get_mad_port(struct ib_device *device, int port_num) { struct ib_mad_port_private *entry; unsigned long flags; spin_lock_irqsave(&ib_mad_port_list_lock, flags); entry = __ib_get_mad_port(device, port_num); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); return entry; } static inline u8 convert_mgmt_class(u8 mgmt_class) { /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ? 
0 : mgmt_class; } static int get_spl_qp_index(enum ib_qp_type qp_type) { switch (qp_type) { case IB_QPT_SMI: return 0; case IB_QPT_GSI: return 1; default: return -1; } } static int vendor_class_index(u8 mgmt_class) { return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START; } static int is_vendor_class(u8 mgmt_class) { if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) || (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END)) return 0; return 1; } static int is_vendor_oui(char *oui) { if (oui[0] || oui[1] || oui[2]) return 1; return 0; } static int is_vendor_method_in_use( struct ib_mad_mgmt_vendor_class *vendor_class, struct ib_mad_reg_req *mad_reg_req) { struct ib_mad_mgmt_method_table *method; int i; for (i = 0; i < MAX_MGMT_OUI; i++) { if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) { method = vendor_class->method_table[i]; if (method) { if (method_in_use(&method, mad_reg_req)) return 1; else break; } } } return 0; } int ib_response_mad(struct ib_mad *mad) { return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) || (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) && (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP))); } EXPORT_SYMBOL(ib_response_mad); /* * ib_register_mad_agent - Register to send/receive MADs */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, struct ib_mad_reg_req *mad_reg_req, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, void *context) { struct ib_mad_port_private *port_priv; struct ib_mad_agent *ret = ERR_PTR(-EINVAL); struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_reg_req *reg_req = NULL; struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; struct ib_mad_mgmt_method_table *method; int ret2, qpn; unsigned long flags; u8 mgmt_class, vclass; /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) goto error1; if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) goto error1; /* Validate MAD registration request if supplied */ if (mad_reg_req) { if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) goto error1; if (!recv_handler) goto error1; if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { /* * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only * one in this range currently allowed */ if (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) goto error1; } else if (mad_reg_req->mgmt_class == 0) { /* * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* * If class is in "new" vendor range, * ensure supplied OUI is not zero */ if (!is_vendor_oui(mad_reg_req->oui)) goto error1; } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { if (rmpp_version) goto error1; } /* Make sure class supplied is consistent with QP type */ if (qp_type == IB_QPT_SMI) { if ((mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) goto error1; } else { if ((mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) goto error1; } } else { /* No registration request supplied */ if (!send_handler) goto error1; } /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { ret = ERR_PTR(-ENODEV); goto 
error1; } /* Verify the QP requested is supported. For example, Ethernet devices * will not have QP0 */ if (!port_priv->qp_info[qpn].qp) { ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { ret = ERR_PTR(-ENOMEM); goto error1; } mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(mad_agent_priv->agent.mr)) { ret = ERR_PTR(-ENOMEM); goto error2; } if (mad_reg_req) { reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); if (!reg_req) { ret = ERR_PTR(-ENOMEM); goto error3; } } /* Now, fill in the various structures */ mad_agent_priv->qp_info = &port_priv->qp_info[qpn]; mad_agent_priv->reg_req = reg_req; mad_agent_priv->agent.rmpp_version = rmpp_version; mad_agent_priv->agent.device = device; mad_agent_priv->agent.recv_handler = recv_handler; mad_agent_priv->agent.send_handler = send_handler; mad_agent_priv->agent.context = context; mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_agent_priv->agent.port_num = port_num; spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); INIT_LIST_HEAD(&mad_agent_priv->done_list); INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); atomic_set(&mad_agent_priv->refcount, 1); init_completion(&mad_agent_priv->comp); spin_lock_irqsave(&port_priv->reg_lock, flags); mad_agent_priv->agent.hi_tid = ++ib_mad_client_id; /* * Make sure MAD registration (if supplied) * is non overlapping with any existing ones */ if (mad_reg_req) { mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class); if (!is_vendor_class(mgmt_class)) { class = port_priv->version[mad_reg_req-> mgmt_class_version].class; if (class) { method = class->method_table[mgmt_class]; if (method) { if (method_in_use(&method, mad_reg_req)) goto error4; } } ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv, mgmt_class); } else { /* "New" vendor class range */ vendor = port_priv->version[mad_reg_req-> mgmt_class_version].vendor; if (vendor) { vclass = vendor_class_index(mgmt_class); vendor_class = vendor->vendor_class[vclass]; if (vendor_class) { if (is_vendor_method_in_use( vendor_class, mad_reg_req)) goto error4; } } ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); } if (ret2) { ret = ERR_PTR(ret2); goto error4; } } /* Add mad agent into port's agent list */ list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list); spin_unlock_irqrestore(&port_priv->reg_lock, flags); return &mad_agent_priv->agent; error4: spin_unlock_irqrestore(&port_priv->reg_lock, flags); kfree(reg_req); error3: ib_dereg_mr(mad_agent_priv->agent.mr); error2: kfree(mad_agent_priv); error1: return ret; } EXPORT_SYMBOL(ib_register_mad_agent); static inline int is_snooping_sends(int mad_snoop_flags) { return (mad_snoop_flags & (/*IB_MAD_SNOOP_POSTED_SENDS | IB_MAD_SNOOP_RMPP_SENDS |*/ IB_MAD_SNOOP_SEND_COMPLETIONS /*| IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/)); } static inline int is_snooping_recvs(int mad_snoop_flags) { return (mad_snoop_flags & (IB_MAD_SNOOP_RECVS /*| IB_MAD_SNOOP_RMPP_RECVS*/)); } static int register_snoop_agent(struct ib_mad_qp_info *qp_info, struct ib_mad_snoop_private *mad_snoop_priv) { struct ib_mad_snoop_private **new_snoop_table; unsigned long flags; int i; spin_lock_irqsave(&qp_info->snoop_lock, flags); 
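register_snoop_agent(), which continues below, claims the first empty slot in snoop_table and, only when every slot is taken, grows the table by exactly one pointer with krealloc() (GFP_ATOMIC, since the snoop_lock is held). A small userspace model of that claim-or-grow policy, with realloc() standing in for krealloc() and all names hypothetical:

#include <stdio.h>
#include <stdlib.h>

/* Return the index of a free slot in *table, growing the table by one
 * entry when it is full, like register_snoop_agent(). Returns -1 when
 * the allocation fails (the kernel code returns -ENOMEM instead). */
static int claim_slot(void ***table, int *size, void *item)
{
        void **grown;
        int i;

        for (i = 0; i < *size; i++)
                if (!(*table)[i])
                        break;

        if (i == *size) {
                grown = realloc(*table, sizeof(*grown) * (*size + 1));
                if (!grown)
                        return -1;
                *table = grown;
                (*size)++;
        }
        (*table)[i] = item;
        return i;
}

int main(void)
{
        void **table = NULL;
        int size = 0, item;

        printf("slot %d\n", claim_slot(&table, &size, &item)); /* slot 0 */
        printf("slot %d\n", claim_slot(&table, &size, &item)); /* slot 1 */
        free(table);
        return 0;
}

Growing one slot at a time keeps the common case (a free slot already exists) allocation-free, at the cost of quadratic behavior if many snoop agents register; with at most a handful of agents that trade-off is reasonable.
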
/* Check for empty slot in array. */ for (i = 0; i < qp_info->snoop_table_size; i++) if (!qp_info->snoop_table[i]) break; if (i == qp_info->snoop_table_size) { /* Grow table. */ new_snoop_table = krealloc(qp_info->snoop_table, sizeof mad_snoop_priv * (qp_info->snoop_table_size + 1), GFP_ATOMIC); if (!new_snoop_table) { i = -ENOMEM; goto out; } qp_info->snoop_table = new_snoop_table; qp_info->snoop_table_size++; } qp_info->snoop_table[i] = mad_snoop_priv; atomic_inc(&qp_info->snoop_count); out: spin_unlock_irqrestore(&qp_info->snoop_lock, flags); return i; } struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, int mad_snoop_flags, ib_mad_snoop_handler snoop_handler, ib_mad_recv_handler recv_handler, void *context) { struct ib_mad_port_private *port_priv; struct ib_mad_agent *ret; struct ib_mad_snoop_private *mad_snoop_priv; int qpn; /* Validate parameters */ if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) { ret = ERR_PTR(-EINVAL); goto error1; } qpn = get_spl_qp_index(qp_type); if (qpn == -1) { ret = ERR_PTR(-EINVAL); goto error1; } port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { ret = ERR_PTR(-ENODEV); goto error1; } /* Allocate structures */ mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL); if (!mad_snoop_priv) { ret = ERR_PTR(-ENOMEM); goto error1; } /* Now, fill in the various structures */ mad_snoop_priv->qp_info = &port_priv->qp_info[qpn]; mad_snoop_priv->agent.device = device; mad_snoop_priv->agent.recv_handler = recv_handler; mad_snoop_priv->agent.snoop_handler = snoop_handler; mad_snoop_priv->agent.context = context; mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_snoop_priv->agent.port_num = port_num; mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; init_completion(&mad_snoop_priv->comp); mad_snoop_priv->snoop_index = register_snoop_agent( &port_priv->qp_info[qpn], mad_snoop_priv); if (mad_snoop_priv->snoop_index < 0) { ret = ERR_PTR(mad_snoop_priv->snoop_index); goto error2; } atomic_set(&mad_snoop_priv->refcount, 1); return &mad_snoop_priv->agent; error2: kfree(mad_snoop_priv); error1: return ret; } EXPORT_SYMBOL(ib_register_mad_snoop); static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { if (atomic_dec_and_test(&mad_agent_priv->refcount)) complete(&mad_agent_priv->comp); } static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv) { if (atomic_dec_and_test(&mad_snoop_priv->refcount)) complete(&mad_snoop_priv->comp); } static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_port_private *port_priv; unsigned long flags; /* Note that we could still be handling received MADs */ /* * Canceling all sends results in dropping received response * MADs, preventing us from queuing additional work */ cancel_mads(mad_agent_priv); port_priv = mad_agent_priv->qp_info->port_priv; cancel_delayed_work_sync(&mad_agent_priv->timed_work); spin_lock_irqsave(&port_priv->reg_lock, flags); remove_mad_reg_req(mad_agent_priv); list_del(&mad_agent_priv->agent_list); spin_unlock_irqrestore(&port_priv->reg_lock, flags); flush_workqueue(port_priv->wq); ib_cancel_rmpp_recvs(mad_agent_priv); deref_mad_agent(mad_agent_priv); wait_for_completion(&mad_agent_priv->comp); kfree(mad_agent_priv->reg_req); ib_dereg_mr(mad_agent_priv->agent.mr); kfree(mad_agent_priv); } static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) { struct 
ib_mad_qp_info *qp_info; unsigned long flags; qp_info = mad_snoop_priv->qp_info; spin_lock_irqsave(&qp_info->snoop_lock, flags); qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL; atomic_dec(&qp_info->snoop_count); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); deref_snoop_agent(mad_snoop_priv); wait_for_completion(&mad_snoop_priv->comp); kfree(mad_snoop_priv); } /* * ib_unregister_mad_agent - Unregisters a client from using MAD services */ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; if (!IS_ERR(mad_agent)) { /* If the TID is zero, the agent can only snoop. */ if (mad_agent->hi_tid) { mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); unregister_mad_agent(mad_agent_priv); } else { mad_snoop_priv = container_of(mad_agent, struct ib_mad_snoop_private, agent); unregister_mad_snoop(mad_snoop_priv); } } return 0; } EXPORT_SYMBOL(ib_unregister_mad_agent); static void dequeue_mad(struct ib_mad_list_head *mad_list) { struct ib_mad_queue *mad_queue; unsigned long flags; BUG_ON(!mad_list->mad_queue); mad_queue = mad_list->mad_queue; spin_lock_irqsave(&mad_queue->lock, flags); list_del(&mad_list->list); mad_queue->count--; spin_unlock_irqrestore(&mad_queue->lock, flags); } static void snoop_send(struct ib_mad_qp_info *qp_info, struct ib_mad_send_buf *send_buf, struct ib_mad_send_wc *mad_send_wc, int mad_snoop_flags) { struct ib_mad_snoop_private *mad_snoop_priv; unsigned long flags; int i; spin_lock_irqsave(&qp_info->snoop_lock, flags); for (i = 0; i < qp_info->snoop_table_size; i++) { mad_snoop_priv = qp_info->snoop_table[i]; if (!mad_snoop_priv || !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) continue; atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent, send_buf, mad_send_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); } spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } static void snoop_recv(struct ib_mad_qp_info *qp_info, struct ib_mad_recv_wc *mad_recv_wc, int mad_snoop_flags) { struct ib_mad_snoop_private *mad_snoop_priv; unsigned long flags; int i; spin_lock_irqsave(&qp_info->snoop_lock, flags); for (i = 0; i < qp_info->snoop_table_size; i++) { mad_snoop_priv = qp_info->snoop_table[i]; if (!mad_snoop_priv || !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) continue; atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, mad_recv_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); } spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } static void build_smp_wc(struct ib_qp *qp, u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); wc->wr_id = wr_id; wc->status = IB_WC_SUCCESS; wc->opcode = IB_WC_RECV; wc->pkey_index = pkey_index; wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh); wc->src_qp = IB_QP0; wc->qp = qp; wc->slid = slid; wc->sl = 0; wc->dlid_path_bits = 0; wc->port_num = port_num; } /* * Return 0 if SMP is to be sent * Return 1 if SMP was consumed locally (whether or not solicited) * Return < 0 if error */ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_wr_private *mad_send_wr) { int ret = 0; struct ib_smp *smp = mad_send_wr->send_buf.mad; unsigned long 
flags; struct ib_mad_local_private *local; struct ib_mad_private *mad_priv; struct ib_mad_port_private *port_priv; struct ib_mad_agent_private *recv_mad_agent = NULL; struct ib_device *device = mad_agent_priv->agent.device; u8 port_num; struct ib_wc mad_wc; struct ib_send_wr *send_wr = &mad_send_wr->send_wr; if (device->node_type == RDMA_NODE_IB_SWITCH && smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) port_num = send_wr->wr.ud.port_num; else port_num = mad_agent_priv->agent.port_num; /* * Directed route handling starts if the initial LID routed part of * a request or the ending LID routed part of a response is empty. * If we are at the start of the LID routed part, don't update the * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. */ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) != IB_LID_PERMISSIVE) goto out; if (smi_handle_dr_smp_send(smp, device->node_type, port_num) == IB_SMI_DISCARD) { ret = -EINVAL; printk(KERN_ERR PFX "Invalid directed route\n"); goto out; } /* Check to post send on QP or process locally */ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) goto out; local = kmalloc(sizeof *local, GFP_ATOMIC); if (!local) { ret = -ENOMEM; printk(KERN_ERR PFX "No memory for ib_mad_local_private\n"); goto out; } local->mad_priv = NULL; local->recv_mad_agent = NULL; mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; printk(KERN_ERR PFX "No memory for local response MAD\n"); kfree(local); goto out; } build_smp_wc(mad_agent_priv->agent.qp, send_wr->wr_id, be16_to_cpu(smp->dr_slid), send_wr->wr.ud.pkey_index, send_wr->wr.ud.port_num, &mad_wc); /* No GRH for DR SMP */ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL, (struct ib_mad *)smp, (struct ib_mad *)&mad_priv->mad); switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: if (ib_response_mad(&mad_priv->mad.mad) && mad_agent_priv->agent.recv_handler) { local->mad_priv = mad_priv; local->recv_mad_agent = mad_agent_priv; /* * Reference MAD agent until receive * side of local completion handled */ atomic_inc(&mad_agent_priv->refcount); } else kmem_cache_free(ib_mad_cache, mad_priv); break; case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: kmem_cache_free(ib_mad_cache, mad_priv); break; case IB_MAD_RESULT_SUCCESS: /* Treat like an incoming receive MAD */ port_priv = ib_get_mad_port(mad_agent_priv->agent.device, mad_agent_priv->agent.port_num); if (port_priv) { memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad)); recv_mad_agent = find_mad_agent(port_priv, &mad_priv->mad.mad); } if (!port_priv || !recv_mad_agent) { /* * No receiving agent so drop packet and * generate send completion. 
*/ kmem_cache_free(ib_mad_cache, mad_priv); break; } local->mad_priv = mad_priv; local->recv_mad_agent = recv_mad_agent; break; default: kmem_cache_free(ib_mad_cache, mad_priv); kfree(local); ret = -EINVAL; goto out; } local->mad_send_wr = mad_send_wr; /* Reference MAD agent until send side of local completion handled */ atomic_inc(&mad_agent_priv->refcount); /* Queue local completion to local list */ spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&local->completion_list, &mad_agent_priv->local_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); queue_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->local_work); ret = 1; out: return ret; } static int get_pad_size(int hdr_len, int data_len) { int seg_size, pad; seg_size = sizeof(struct ib_mad) - hdr_len; if (data_len && seg_size) { pad = seg_size - data_len % seg_size; return pad == seg_size ? 0 : pad; } else return seg_size; } static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_segment *s, *t; list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) { list_del(&s->list); kfree(s); } } static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, gfp_t gfp_mask) { struct ib_mad_send_buf *send_buf = &send_wr->send_buf; struct ib_rmpp_mad *rmpp_mad = send_buf->mad; struct ib_rmpp_segment *seg = NULL; int left, seg_size, pad; send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len; seg_size = send_buf->seg_size; pad = send_wr->pad; /* Allocate data segments. */ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); if (!seg) { printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem " "alloc failed for len %zd, gfp %#x\n", sizeof (*seg) + seg_size, gfp_mask); free_send_rmpp_list(send_wr); return -ENOMEM; } seg->num = ++send_buf->seg_count; list_add_tail(&seg->list, &send_wr->rmpp_list); } /* Zero any padding */ if (pad) memset(seg->data + seg_size - pad, 0, pad); rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv-> agent.rmpp_version; rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); send_wr->cur_seg = container_of(send_wr->rmpp_list.next, struct ib_rmpp_segment, list); send_wr->last_ack_seg = send_wr->cur_seg; return 0; } struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, int hdr_len, int data_len, gfp_t gfp_mask) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; int pad, message_size, ret, size; void *buf; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); pad = get_pad_size(hdr_len, data_len); message_size = hdr_len + data_len + pad; if ((!mad_agent->rmpp_version && (rmpp_active || message_size > sizeof(struct ib_mad))) || (!rmpp_active && message_size > sizeof(struct ib_mad))) return ERR_PTR(-EINVAL); size = rmpp_active ? 
hdr_len : sizeof(struct ib_mad); buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); if (!buf) return ERR_PTR(-ENOMEM); mad_send_wr = buf + size; INIT_LIST_HEAD(&mad_send_wr->rmpp_list); mad_send_wr->send_buf.mad = buf; mad_send_wr->send_buf.hdr_len = hdr_len; mad_send_wr->send_buf.data_len = data_len; mad_send_wr->pad = pad; mad_send_wr->mad_agent_priv = mad_agent_priv; mad_send_wr->sg_list[0].length = hdr_len; mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len; mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr; mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; mad_send_wr->send_wr.num_sge = 2; mad_send_wr->send_wr.opcode = IB_WR_SEND; mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED; mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn; mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index; if (rmpp_active) { ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask); if (ret) { kfree(buf); return ERR_PTR(ret); } } mad_send_wr->send_buf.mad_agent = mad_agent; atomic_inc(&mad_agent_priv->refcount); return &mad_send_wr->send_buf; } EXPORT_SYMBOL(ib_create_send_mad); int ib_get_mad_data_offset(u8 mgmt_class) { if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) return IB_MGMT_SA_HDR; else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || (mgmt_class == IB_MGMT_CLASS_BIS)) return IB_MGMT_DEVICE_HDR; else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) return IB_MGMT_VENDOR_HDR; else return IB_MGMT_MAD_HDR; } EXPORT_SYMBOL(ib_get_mad_data_offset); int ib_is_mad_class_rmpp(u8 mgmt_class) { if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) || (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || (mgmt_class == IB_MGMT_CLASS_BIS) || ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))) return 1; return 0; } EXPORT_SYMBOL(ib_is_mad_class_rmpp); void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num) { struct ib_mad_send_wr_private *mad_send_wr; struct list_head *list; mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); list = &mad_send_wr->cur_seg->list; if (mad_send_wr->cur_seg->num < seg_num) { list_for_each_entry(mad_send_wr->cur_seg, list, list) if (mad_send_wr->cur_seg->num == seg_num) break; } else if (mad_send_wr->cur_seg->num > seg_num) { list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list) if (mad_send_wr->cur_seg->num == seg_num) break; } return mad_send_wr->cur_seg->data; } EXPORT_SYMBOL(ib_get_rmpp_segment); static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr) { if (mad_send_wr->send_buf.seg_count) return ib_get_rmpp_segment(&mad_send_wr->send_buf, mad_send_wr->seg_num); else return mad_send_wr->send_buf.mad + mad_send_wr->send_buf.hdr_len; } void ib_free_send_mad(struct ib_mad_send_buf *send_buf) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; mad_agent_priv = container_of(send_buf->mad_agent, struct ib_mad_agent_private, agent); mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); free_send_rmpp_list(mad_send_wr); kfree(send_buf->mad); deref_mad_agent(mad_agent_priv); } EXPORT_SYMBOL(ib_free_send_mad); int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_qp_info *qp_info; struct 
list_head *list; struct ib_send_wr *bad_send_wr; struct ib_mad_agent *mad_agent; struct ib_sge *sge; unsigned long flags; int ret; /* Set WR ID to find mad_send_wr upon completion */ qp_info = mad_send_wr->mad_agent_priv->qp_info; mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; sge[0].addr = ib_dma_map_single(mad_agent->device, mad_send_wr->send_buf.mad, sge[0].length, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) return -ENOMEM; sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { ret = -ENOMEM; goto dma1_err; } mad_send_wr->header_mapping = sge[0].addr; mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr, &bad_send_wr); list = &qp_info->send_queue.list; } else { ret = 0; list = &qp_info->overflow_list; } if (!ret) { qp_info->send_queue.count++; list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); if (!ret) return 0; ib_dma_unmap_single(mad_agent->device, mad_send_wr->header_mapping, sge[1].length, DMA_TO_DEVICE); dma1_err: ib_dma_unmap_single(mad_agent->device, mad_send_wr->payload_mapping, sge[0].length, DMA_TO_DEVICE); return ret; } /* * Send SA MAD that passed congestion control */ static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, u32 timeout_ms, u32 retries_left) { int ret; unsigned long flags; struct ib_mad_agent_private *mad_agent_priv; mad_agent_priv = mad_send_wr->mad_agent_priv; mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); mad_send_wr->retries_left = retries_left; mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); /* Reference MAD agent until send completes */ atomic_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); ret = ib_send_mad(mad_send_wr); if (ret < 0) { /* Fail send request */ spin_lock_irqsave(&mad_agent_priv->lock, flags); list_del(&mad_send_wr->agent_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); } return ret; } /* * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client */ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, struct ib_mad_send_buf **bad_send_buf) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_buf *next_send_buf; struct ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int ret = -EINVAL; /* Walk list of send WRs and post each on send list */ for (; send_buf; send_buf = next_send_buf) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); mad_agent_priv = mad_send_wr->mad_agent_priv; if (!send_buf->mad_agent->send_handler || (send_buf->timeout_ms && !send_buf->mad_agent->recv_handler)) { ret = -EINVAL; goto error; } if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) { if (mad_agent_priv->agent.rmpp_version) { ret = -EINVAL; goto error; } } /* * Save pointer to next work request to post in case the * current one completes, and the user modifies the work * request 
associated with the completion */ next_send_buf = send_buf->next; mad_send_wr->send_wr.wr.ud.ah = send_buf->ah; if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { ret = handle_outgoing_dr_smp(mad_agent_priv, mad_send_wr); if (ret < 0) /* error */ goto error; else if (ret == 1) /* locally consumed */ continue; } mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid; /* Timeout will be updated after send completes */ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms); mad_send_wr->max_retries = send_buf->retries; mad_send_wr->retries_left = send_buf->retries; send_buf->retries = 0; /* Reference for work request to QP + response */ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); mad_send_wr->status = IB_WC_SUCCESS; if (is_sa_cc_mad(mad_send_wr)) { mad_send_wr->is_sa_cc_mad = 1; ret = sa_cc_mad_send(mad_send_wr); if (ret < 0) goto error; } else { /* Reference MAD agent until send completes */ atomic_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (mad_agent_priv->agent.rmpp_version) { ret = ib_send_rmpp_mad(mad_send_wr); if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) ret = ib_send_mad(mad_send_wr); } else ret = ib_send_mad(mad_send_wr); if (ret < 0) { /* Fail send request */ spin_lock_irqsave(&mad_agent_priv->lock, flags); list_del(&mad_send_wr->agent_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); goto error; } } } return 0; error: if (bad_send_buf) *bad_send_buf = send_buf; return ret; } EXPORT_SYMBOL(ib_post_send_mad); /* * ib_free_recv_mad - Returns data buffers used to receive * a MAD to the access layer */ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *priv; struct list_head free_list; INIT_LIST_HEAD(&free_list); list_splice_init(&mad_recv_wc->rmpp_list, &free_list); list_for_each_entry_safe(mad_recv_buf, temp_recv_buf, &free_list, list) { mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc, recv_buf); mad_priv_hdr = container_of(mad_recv_wc, struct ib_mad_private_header, recv_wc); priv = container_of(mad_priv_hdr, struct ib_mad_private, header); kmem_cache_free(ib_mad_cache, priv); } } EXPORT_SYMBOL(ib_free_recv_mad); struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, void *context) { return ERR_PTR(-EINVAL); /* XXX: for now */ } EXPORT_SYMBOL(ib_redirect_mad_qp); int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc) { printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n"); return 0; } EXPORT_SYMBOL(ib_process_mad_wc); static int method_in_use(struct ib_mad_mgmt_method_table **method, struct ib_mad_reg_req *mad_reg_req) { int i; for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { printk(KERN_ERR PFX "Method %d already in use\n", i); return -EINVAL; } } return 0; } static int allocate_method_table(struct ib_mad_mgmt_method_table **method) { /* Allocate management method table */ *method = kzalloc(sizeof **method, GFP_ATOMIC); if (!*method) { printk(KERN_ERR PFX "No memory for " "ib_mad_mgmt_method_table\n"); return -ENOMEM; } return 0; } /* * Check to see if there are any methods still in use */ static int 
check_method_table(struct ib_mad_mgmt_method_table *method) { int i; for (i = 0; i < IB_MGMT_MAX_METHODS; i++) if (method->agent[i]) return 1; return 0; } /* * Check to see if there are any method tables for this class still in use */ static int check_class_table(struct ib_mad_mgmt_class_table *class) { int i; for (i = 0; i < MAX_MGMT_CLASS; i++) if (class->method_table[i]) return 1; return 0; } static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class) { int i; for (i = 0; i < MAX_MGMT_OUI; i++) if (vendor_class->method_table[i]) return 1; return 0; } static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, char *oui) { int i; for (i = 0; i < MAX_MGMT_OUI; i++) /* Is there matching OUI for this vendor class ? */ if (!memcmp(vendor_class->oui[i], oui, 3)) return i; return -1; } static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor) { int i; for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++) if (vendor->vendor_class[i]) return 1; return 0; } static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, struct ib_mad_agent_private *agent) { int i; /* Remove any methods for this mad agent */ for (i = 0; i < IB_MGMT_MAX_METHODS; i++) { if (method->agent[i] == agent) { method->agent[i] = NULL; } } } static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv, u8 mgmt_class) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_class_table **class; struct ib_mad_mgmt_method_table **method; int i, ret; port_priv = agent_priv->qp_info->port_priv; class = &port_priv->version[mad_reg_req->mgmt_class_version].class; if (!*class) { /* Allocate management class table for "new" class version */ *class = kzalloc(sizeof **class, GFP_ATOMIC); if (!*class) { printk(KERN_ERR PFX "No memory for " "ib_mad_mgmt_class_table\n"); ret = -ENOMEM; goto error1; } /* Allocate method table for this management class */ method = &(*class)->method_table[mgmt_class]; if ((ret = allocate_method_table(method))) goto error2; } else { method = &(*class)->method_table[mgmt_class]; if (!*method) { /* Allocate method table for this management class */ if ((ret = allocate_method_table(method))) goto error1; } } /* Now, make sure methods are not already in use */ if (method_in_use(method, mad_reg_req)) goto error3; /* Finally, add in methods being registered */ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; return 0; error3: /* Remove any methods for this mad agent */ remove_methods_mad_agent(*method, agent_priv); /* Now, check to see if there are any methods in use */ if (!check_method_table(*method)) { /* If not, release management method table */ kfree(*method); *method = NULL; } ret = -EINVAL; goto error1; error2: kfree(*class); *class = NULL; error1: return ret; } static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_vendor_class_table **vendor_table; struct ib_mad_mgmt_vendor_class_table *vendor = NULL; struct ib_mad_mgmt_vendor_class *vendor_class = NULL; struct ib_mad_mgmt_method_table **method; int i, ret = -ENOMEM; u8 vclass; /* "New" vendor (with OUI) class */ vclass = vendor_class_index(mad_reg_req->mgmt_class); port_priv = agent_priv->qp_info->port_priv; vendor_table = &port_priv->version[ mad_reg_req->mgmt_class_version].vendor; if (!*vendor_table) { /* Allocate mgmt vendor class table for "new" class version */ vendor = kzalloc(sizeof 
*vendor, GFP_ATOMIC); if (!vendor) { printk(KERN_ERR PFX "No memory for " "ib_mad_mgmt_vendor_class_table\n"); goto error1; } *vendor_table = vendor; } if (!(*vendor_table)->vendor_class[vclass]) { /* Allocate table for this management vendor class */ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); if (!vendor_class) { printk(KERN_ERR PFX "No memory for " "ib_mad_mgmt_vendor_class\n"); goto error2; } (*vendor_table)->vendor_class[vclass] = vendor_class; } for (i = 0; i < MAX_MGMT_OUI; i++) { /* Is there matching OUI for this vendor class ? */ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i], mad_reg_req->oui, 3)) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; BUG_ON(!*method); goto check_in_use; } } for (i = 0; i < MAX_MGMT_OUI; i++) { /* OUI slot available ? */ if (!is_vendor_oui((*vendor_table)->vendor_class[ vclass]->oui[i])) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; BUG_ON(*method); /* Allocate method table for this OUI */ if ((ret = allocate_method_table(method))) goto error3; memcpy((*vendor_table)->vendor_class[vclass]->oui[i], mad_reg_req->oui, 3); goto check_in_use; } } printk(KERN_ERR PFX "All OUI slots in use\n"); goto error3; check_in_use: /* Now, make sure methods are not already in use */ if (method_in_use(method, mad_reg_req)) goto error4; /* Finally, add in methods being registered */ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; return 0; error4: /* Remove any methods for this mad agent */ remove_methods_mad_agent(*method, agent_priv); /* Now, check to see if there are any methods in use */ if (!check_method_table(*method)) { /* If not, release management method table */ kfree(*method); *method = NULL; } ret = -EINVAL; error3: if (vendor_class) { (*vendor_table)->vendor_class[vclass] = NULL; kfree(vendor_class); } error2: if (vendor) { *vendor_table = NULL; kfree(vendor); } error1: return ret; } static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; int index; u8 mgmt_class; /* * Was MAD registration request supplied * with original registration ? */ if (!agent_priv->reg_req) { goto out; } port_priv = agent_priv->qp_info->port_priv; mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); class = port_priv->version[ agent_priv->reg_req->mgmt_class_version].class; if (!class) goto vendor_check; method = class->method_table[mgmt_class]; if (method) { /* Remove any methods for this mad agent */ remove_methods_mad_agent(method, agent_priv); /* Now, check to see if there are any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ kfree(method); class->method_table[mgmt_class] = NULL; /* Any management classes left ? 
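* (i.e. does any management class in this version's class table still
* have a method table)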
*/ if (!check_class_table(class)) { /* If not, release management class table */ kfree(class); port_priv->version[ agent_priv->reg_req-> mgmt_class_version].class = NULL; } } } vendor_check: if (!is_vendor_class(mgmt_class)) goto out; /* normalize mgmt_class to vendor range 2 */ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class); vendor = port_priv->version[ agent_priv->reg_req->mgmt_class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[mgmt_class]; if (vendor_class) { index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui); if (index < 0) goto out; method = vendor_class->method_table[index]; if (method) { /* Remove any methods for this mad agent */ remove_methods_mad_agent(method, agent_priv); /* * Now, check to see if there are * any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ kfree(method); vendor_class->method_table[index] = NULL; memset(vendor_class->oui[index], 0, 3); /* Any OUIs left ? */ if (!check_vendor_class(vendor_class)) { /* If not, release vendor class table */ kfree(vendor_class); vendor->vendor_class[mgmt_class] = NULL; /* Any other vendor classes left ? */ if (!check_vendor_table(vendor)) { kfree(vendor); port_priv->version[ agent_priv->reg_req-> mgmt_class_version]. vendor = NULL; } } } } } out: return; } static struct ib_mad_agent_private * find_mad_agent(struct ib_mad_port_private *port_priv, struct ib_mad *mad) { struct ib_mad_agent_private *mad_agent = NULL; unsigned long flags; spin_lock_irqsave(&port_priv->reg_lock, flags); if (ib_response_mad(mad)) { u32 hi_tid; struct ib_mad_agent_private *entry; /* * Routing is based on high 32 bits of transaction ID * of MAD. */ hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32; list_for_each_entry(entry, &port_priv->agent_list, agent_list) { if (entry->agent.hi_tid == hi_tid) { mad_agent = entry; break; } } } else { struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; struct ib_vendor_mad *vendor_mad; int index; /* * Routing is based on version, class, and method * For "newer" vendor MADs, also based on OUI */ if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION) goto out; if (!is_vendor_class(mad->mad_hdr.mgmt_class)) { class = port_priv->version[ mad->mad_hdr.class_version].class; if (!class) goto out; if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >= IB_MGMT_MAX_METHODS) goto out; method = class->method_table[convert_mgmt_class( mad->mad_hdr.mgmt_class)]; if (method) mad_agent = method->agent[mad->mad_hdr.method & ~IB_MGMT_METHOD_RESP]; } else { vendor = port_priv->version[ mad->mad_hdr.class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[vendor_class_index( mad->mad_hdr.mgmt_class)]; if (!vendor_class) goto out; /* Find matching OUI */ vendor_mad = (struct ib_vendor_mad *)mad; index = find_vendor_oui(vendor_class, vendor_mad->oui); if (index == -1) goto out; method = vendor_class->method_table[index]; if (method) { mad_agent = method->agent[mad->mad_hdr.method & ~IB_MGMT_METHOD_RESP]; } } } if (mad_agent) { if (mad_agent->agent.recv_handler) atomic_inc(&mad_agent->refcount); else { printk(KERN_NOTICE PFX "No receive handler for client " "%p on port %d\n", &mad_agent->agent, port_priv->port_num); mad_agent = NULL; } } out: spin_unlock_irqrestore(&port_priv->reg_lock, flags); return mad_agent; } static int validate_mad(struct ib_mad *mad, u32 qp_num) { int valid = 0; /* Make 
sure MAD base version is understood */ if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { printk(KERN_ERR PFX "MAD received with unsupported base " "version %d\n", mad->mad_hdr.base_version); goto out; } /* Filter SMI packets sent to other than QP0 */ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { if (qp_num == 0) valid = 1; } else { /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; } out: return valid; } static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_hdr *mad_hdr) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; return !mad_agent_priv->agent.rmpp_version || !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE) || (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); } static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr, struct ib_mad_recv_wc *rwc) { return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class == rwc->recv_buf.mad->mad_hdr.mgmt_class; } static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_wr_private *wr, struct ib_mad_recv_wc *rwc ) { struct ib_ah_attr attr; u8 send_resp, rcv_resp; union ib_gid sgid; struct ib_device *device = mad_agent_priv->agent.device; u8 port_num = mad_agent_priv->agent.port_num; u8 lmc; send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad); rcv_resp = ib_response_mad(rwc->recv_buf.mad); if (send_resp == rcv_resp) /* both requests, or both responses. GIDs different */ return 0; if (ib_query_ah(wr->send_buf.ah, &attr)) /* Assume not equal, to avoid false positives. */ return 0; if (!!(attr.ah_flags & IB_AH_GRH) != !!(rwc->wc->wc_flags & IB_WC_GRH)) /* one has GID, other does not. Assume different */ return 0; if (!send_resp && rcv_resp) { /* is request/response. */ if (!(attr.ah_flags & IB_AH_GRH)) { if (ib_get_cached_lmc(device, port_num, &lmc)) return 0; return (!lmc || !((attr.src_path_bits ^ rwc->wc->dlid_path_bits) & ((1 << lmc) - 1))); } else { if (ib_get_cached_gid(device, port_num, attr.grh.sgid_index, &sgid)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); } } if (!(attr.ah_flags & IB_AH_GRH)) return attr.dlid == rwc->wc->slid; else return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw, 16); } static inline int is_direct(u8 class) { return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE); } struct ib_mad_send_wr_private* ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_recv_wc *wc) { struct ib_mad_send_wr_private *wr; struct ib_mad *mad; mad = (struct ib_mad *)wc->recv_buf.mad; list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { if ((wr->tid == mad->mad_hdr.tid) && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } /* * It's possible to receive the response before we've * been notified that the send has completed */ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { if (is_data_mad(mad_agent_priv, wr->send_buf.mad) && wr->tid == mad->mad_hdr.tid && wr->timeout && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. 
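* and a permissive address would make the comparison in
* rcv_has_same_gid() unreliable.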
*/ (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } return NULL; } void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) { mad_send_wr->timeout = 0; if (mad_send_wr->refcount == 1) list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->done_list); } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); if (mad_agent_priv->agent.rmpp_version) { mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, mad_recv_wc); if (!mad_recv_wc) { deref_mad_agent(mad_agent_priv); return; } } /* Complete corresponding request */ if (ib_response_mad(mad_recv_wc->recv_buf.mad)) { spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); ib_free_recv_mad(mad_recv_wc); deref_mad_agent(mad_agent_priv); return; } ib_mark_mad_done(mad_send_wr); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &mad_send_wr->send_buf; ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } else { mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, mad_recv_wc); deref_mad_agent(mad_agent_priv); } } static bool generate_unmatched_resp(struct ib_mad_private *recv, struct ib_mad_private *response) { if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET || recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) { memcpy(response, recv, sizeof *response); response->header.recv_wc.wc = &response->header.wc; response->header.recv_wc.recv_buf.mad = &response->mad.mad; response->header.recv_wc.recv_buf.grh = &response->grh; response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; response->mad.mad.mad_hdr.status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION; return true; } else { return false; } } static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); ib_dma_unmap_single(port_priv->device, recv->header.mapping, sizeof(struct ib_mad_private) - sizeof(struct ib_mad_private_header), DMA_FROM_DEVICE); /* Setup MAD receive work completion from "normal" work completion */ recv->header.wc = *wc; recv->header.recv_wc.wc = &recv->header.wc; recv->header.recv_wc.mad_len = sizeof(struct 
ib_mad); recv->header.recv_wc.recv_buf.mad = &recv->mad.mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; if (atomic_read(&qp_info->snoop_count)) snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); /* Validate MAD */ if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num)) goto out; response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!response) { printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory " "for response buffer\n"); goto out; } if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) port_num = wc->port_num; else port_num = port_priv->port_num; if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { enum smi_forward_action retsmi; if (smi_handle_dr_smp_recv(&recv->mad.smp, port_priv->device->node_type, port_num, port_priv->device->phys_port_cnt) == IB_SMI_DISCARD) goto out; retsmi = smi_check_forward_dr_smp(&recv->mad.smp); if (retsmi == IB_SMI_LOCAL) goto local; if (retsmi == IB_SMI_SEND) { /* don't forward */ if (smi_handle_dr_smp_send(&recv->mad.smp, port_priv->device->node_type, port_num) == IB_SMI_DISCARD) goto out; if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD) goto out; } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { /* forward case for switches */ memcpy(response, recv, sizeof(*response)); response->header.recv_wc.wc = &response->header.wc; response->header.recv_wc.recv_buf.mad = &response->mad.mad; response->header.recv_wc.recv_buf.grh = &response->grh; agent_send_response(&response->mad.mad, &response->grh, wc, port_priv->device, smi_get_fwd_port(&recv->mad.smp), qp_info->qp->qp_num); goto out; } } local: /* Give driver "right of first refusal" on incoming MAD */ if (port_priv->device->process_mad) { ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, wc, &recv->grh, &recv->mad.mad, &response->mad.mad); if (ret & IB_MAD_RESULT_SUCCESS) { if (ret & IB_MAD_RESULT_CONSUMED) goto out; if (ret & IB_MAD_RESULT_REPLY) { agent_send_response(&response->mad.mad, &recv->grh, wc, port_priv->device, port_num, qp_info->qp->qp_num); goto out; } } } mad_agent = find_mad_agent(port_priv, &recv->mad.mad); if (mad_agent) { ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); /* * recv is freed up in error cases in ib_mad_complete_recv * or via recv_handler in ib_mad_complete_recv() */ recv = NULL; } else if ((ret & IB_MAD_RESULT_SUCCESS) && generate_unmatched_resp(recv, response)) { agent_send_response(&response->mad.mad, &recv->grh, wc, port_priv->device, port_num, qp_info->qp->qp_num); } out: /* Post another receive request for this QP */ if (response) { ib_mad_post_receive_mads(qp_info, response); if (recv) kmem_cache_free(ib_mad_cache, recv); } else ib_mad_post_receive_mads(qp_info, recv); } static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_send_wr_private *mad_send_wr; unsigned long delay; if (list_empty(&mad_agent_priv->wait_list)) { cancel_delayed_work(&mad_agent_priv->timed_work); } else { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_agent_priv->timeout, mad_send_wr->timeout)) { mad_agent_priv->timeout = mad_send_wr->timeout; delay = mad_send_wr->timeout - jiffies; if ((long)delay <= 0) delay = 1; mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->timed_work, delay); } } } static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private 
*temp_mad_send_wr; struct list_head *list_item; unsigned long delay; mad_agent_priv = mad_send_wr->mad_agent_priv; list_del(&mad_send_wr->agent_list); delay = mad_send_wr->timeout; mad_send_wr->timeout += jiffies; if (delay) { list_for_each_prev(list_item, &mad_agent_priv->wait_list) { temp_mad_send_wr = list_entry(list_item, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_send_wr->timeout, temp_mad_send_wr->timeout)) break; } } else list_item = &mad_agent_priv->wait_list; list_add(&mad_send_wr->agent_list, list_item); /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->timed_work, delay); } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, int timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); } /* * Process a send work completion */ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_agent_private *mad_agent_priv; unsigned long flags; int ret; mad_agent_priv = mad_send_wr->mad_agent_priv; spin_lock_irqsave(&mad_agent_priv->lock, flags); if (mad_agent_priv->agent.rmpp_version) { ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); if (ret == IB_RMPP_RESULT_CONSUMED) goto done; } else ret = IB_RMPP_RESULT_UNHANDLED; if (mad_send_wc->status != IB_WC_SUCCESS && mad_send_wr->status == IB_WC_SUCCESS) { mad_send_wr->status = mad_send_wc->status; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } if (--mad_send_wr->refcount > 0) { if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && mad_send_wr->status == IB_WC_SUCCESS) { wait_for_response(mad_send_wr); } goto done; } /* Remove send from MAD agent and notify client of completion */ list_del(&mad_send_wr->agent_list); adjust_timeout(mad_agent_priv); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (mad_send_wr->status != IB_WC_SUCCESS ) mad_send_wc->status = mad_send_wr->status; if (ret == IB_RMPP_RESULT_INTERNAL) ib_rmpp_send_handler(mad_send_wc); else { if (mad_send_wr->is_sa_cc_mad) sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); } /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); return; done: spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; struct ib_send_wr *bad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; int ret; mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; qp_info = send_queue->qp_info; retry: ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->header_mapping, mad_send_wr->sg_list[0].length, DMA_TO_DEVICE); ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->payload_mapping, mad_send_wr->sg_list[1].length, DMA_TO_DEVICE); queued_send_wr = NULL; spin_lock_irqsave(&send_queue->lock, flags); list_del(&mad_list->list); /* Move queued send to the send queue */ if (send_queue->count-- > send_queue->max_active) { mad_list = container_of(qp_info->overflow_list.next, struct 
ib_mad_list_head, list); queued_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); list_move_tail(&mad_list->list, &send_queue->list); } spin_unlock_irqrestore(&send_queue->lock, flags); mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_send_wc.status = wc->status; mad_send_wc.vendor_err = wc->vendor_err; if (atomic_read(&qp_info->snoop_count)) snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, &bad_send_wr); if (ret) { printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret); mad_send_wr = queued_send_wr; wc->status = IB_WC_LOC_QP_OP_ERR; goto retry; } } } static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_list_head *mad_list; unsigned long flags; spin_lock_irqsave(&qp_info->send_queue.lock, flags); list_for_each_entry(mad_list, &qp_info->send_queue.list, list) { mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); mad_send_wr->retry = 1; } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } static void mad_error_handler(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; /* Determine if failure was a send or receive */ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; qp_info = mad_list->mad_queue->qp_info; if (mad_list->mad_queue == &qp_info->recv_queue) /* * Receive errors indicate that the QP has entered the error * state - error handling/shutdown code will cleanup */ return; /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests */ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); if (wc->status == IB_WC_WR_FLUSH_ERR) { if (mad_send_wr->retry) { /* Repost send */ struct ib_send_wr *bad_send_wr; mad_send_wr->retry = 0; ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr, &bad_send_wr); if (ret) ib_mad_send_done_handler(port_priv, wc); } else ib_mad_send_done_handler(port_priv, wc); } else { struct ib_qp_attr *attr; /* Transition QP to RTS and fail offending send */ attr = kmalloc(sizeof *attr, GFP_KERNEL); if (attr) { attr->qp_state = IB_QPS_RTS; attr->cur_qp_state = IB_QPS_SQE; ret = ib_modify_qp(qp_info->qp, attr, IB_QP_STATE | IB_QP_CUR_STATE); kfree(attr); if (ret) printk(KERN_ERR PFX "mad_error_handler - " "ib_modify_qp to RTS : %d\n", ret); else mark_sends_for_retry(qp_info); } ib_mad_send_done_handler(port_priv, wc); } } /* * IB MAD completion callback */ static void ib_mad_completion_handler(struct work_struct *work) { struct ib_mad_port_private *port_priv; struct ib_wc wc; port_priv = container_of(work, struct ib_mad_port_private, work); ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { if (wc.status == IB_WC_SUCCESS) { switch (wc.opcode) { case IB_WC_SEND: ib_mad_send_done_handler(port_priv, &wc); break; case IB_WC_RECV: ib_mad_recv_done_handler(port_priv, &wc); break; default: BUG_ON(1); break; } } else mad_error_handler(port_priv, &wc); } } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) { unsigned long flags; struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; struct ib_mad_send_wc mad_send_wc; struct list_head cancel_list; INIT_LIST_HEAD(&cancel_list); 
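	/*
	 * Cancel anything still queued by the SA congestion-control layer
	 * first, then fail this agent's outstanding sends and splice its
	 * wait list onto cancel_list so the completions can be reported
	 * outside the agent lock.
	 */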
cancel_sa_cc_mads(mad_agent_priv); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &mad_agent_priv->send_list, agent_list) { if (mad_send_wr->status == IB_WC_SUCCESS) { mad_send_wr->status = IB_WC_WR_FLUSH_ERR; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } } /* Empty wait list to prevent receives from finding a request */ list_splice_init(&mad_agent_priv->wait_list, &cancel_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Report all cancelled requests */ mad_send_wc.status = IB_WC_WR_FLUSH_ERR; mad_send_wc.vendor_err = 0; list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &cancel_list, agent_list) { mad_send_wc.send_buf = &mad_send_wr->send_buf; list_del(&mad_send_wr->agent_list); if (mad_send_wr->is_sa_cc_mad) sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); } } static struct ib_mad_send_wr_private* find_send_wr(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_buf *send_buf) { struct ib_mad_send_wr_private *mad_send_wr; list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list, agent_list) { if (&mad_send_wr->send_buf == send_buf) return mad_send_wr; } list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, agent_list) { if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) && &mad_send_wr->send_buf == send_buf) return mad_send_wr; } return NULL; } int ib_modify_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, u32 timeout_ms) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int active; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); if (!mad_send_wr) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (modify_sa_cc_mad(mad_agent_priv, send_buf, timeout_ms)) return -EINVAL; return 0; } if (mad_send_wr->status != IB_WC_SUCCESS) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); if (!timeout_ms) { mad_send_wr->status = IB_WC_WR_FLUSH_ERR; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } mad_send_wr->send_buf.timeout_ms = timeout_ms; if (active) mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); else ib_reset_mad_timeout(mad_send_wr, timeout_ms); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return 0; } EXPORT_SYMBOL(ib_modify_mad); void ib_cancel_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf) { ib_modify_mad(mad_agent, send_buf, 0); } EXPORT_SYMBOL(ib_cancel_mad); static void local_completions(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_local_private *local; struct ib_mad_agent_private *recv_mad_agent; unsigned long flags; int free_mad; struct ib_wc wc; struct ib_mad_send_wc mad_send_wc; mad_agent_priv = container_of(work, struct ib_mad_agent_private, local_work); spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->local_list)) { local = list_entry(mad_agent_priv->local_list.next, struct ib_mad_local_private, completion_list); list_del(&local->completion_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); free_mad = 0; if (local->mad_priv) { recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { printk(KERN_ERR PFX "No receive MAD agent for 
local completion\n"); free_mad = 1; goto local_send_completion; } /* * Defined behavior is to complete response * before request */ build_smp_wc(recv_mad_agent->agent.qp, (unsigned long) local->mad_send_wr, be16_to_cpu(IB_LID_PERMISSIVE), 0, recv_mad_agent->agent.port_num, &wc); local->mad_priv->header.recv_wc.wc = &wc; local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad); INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); list_add(&local->mad_priv->header.recv_wc.recv_buf.list, &local->mad_priv->header.recv_wc.rmpp_list); local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = &local->mad_priv->mad.mad; if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) snoop_recv(recv_mad_agent->qp_info, &local->mad_priv->header.recv_wc, IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); spin_unlock_irqrestore(&recv_mad_agent->lock, flags); } local_send_completion: /* Complete send */ mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &local->mad_send_wr->send_buf; if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) snoop_send(mad_agent_priv->qp_info, &local->mad_send_wr->send_buf, &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); spin_lock_irqsave(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); if (free_mad) kmem_cache_free(ib_mad_cache, local->mad_priv); kfree(local); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) { int ret; if (!mad_send_wr->retries_left) return -ETIMEDOUT; mad_send_wr->retries_left--; mad_send_wr->send_buf.retries++; mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); if (mad_send_wr->mad_agent_priv->agent.rmpp_version) { ret = ib_retry_rmpp(mad_send_wr); switch (ret) { case IB_RMPP_RESULT_UNHANDLED: ret = ib_send_mad(mad_send_wr); break; case IB_RMPP_RESULT_CONSUMED: ret = 0; break; default: ret = -ECOMM; break; } } else ret = ib_send_mad(mad_send_wr); if (!ret) { mad_send_wr->refcount++; list_add_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } return ret; } static void timeout_sends(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); mad_send_wc.vendor_err = 0; spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_send_wr->timeout, jiffies)) { delay = mad_send_wr->timeout - jiffies; if ((long)delay <= 0) delay = 1; queue_delayed_work(mad_agent_priv->qp_info-> port_priv->wq, &mad_agent_priv->timed_work, delay); break; } list_del(&mad_send_wr->agent_list); if (mad_send_wr->status == IB_WC_SUCCESS && !retry_send(mad_send_wr)) continue; spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (mad_send_wr->status == IB_WC_SUCCESS) mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; else mad_send_wc.status = mad_send_wr->status; mad_send_wc.send_buf = &mad_send_wr->send_buf; if (mad_send_wr->is_sa_cc_mad) sa_cc_mad_done(get_cc_obj(mad_send_wr)); 
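		/*
		 * Deliver the timeout (or the request's stored error status)
		 * to the client without holding the agent lock; the lock is
		 * retaken below before the wait list is examined again.
		 */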
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) { struct ib_mad_port_private *port_priv = cq->cq_context; unsigned long flags; spin_lock_irqsave(&ib_mad_port_list_lock, flags); if (!list_empty(&port_priv->port_list)) queue_work(port_priv->wq, &port_priv->work); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); } /* * Allocate receive MADs and post receive WRs for them */ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad) { unsigned long flags; int post, ret; struct ib_mad_private *mad_priv; struct ib_sge sg_list; struct ib_recv_wr recv_wr, *bad_recv_wr; struct ib_mad_queue *recv_queue = &qp_info->recv_queue; /* Initialize common scatter list fields */ sg_list.length = sizeof *mad_priv - sizeof mad_priv->header; sg_list.lkey = (*qp_info->port_priv->mr).lkey; /* Initialize common receive WR fields */ recv_wr.next = NULL; recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; do { /* Allocate and map receive buffer */ if (mad) { mad_priv = mad; mad = NULL; } else { mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!mad_priv) { printk(KERN_ERR PFX "No memory for receive buffer\n"); ret = -ENOMEM; break; } } sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, &mad_priv->grh, sizeof *mad_priv - sizeof mad_priv->header, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { ret = -ENOMEM; kmem_cache_free(ib_mad_cache, mad_priv); printk(KERN_ERR PFX "ib_dma_map_single failed\n"); break; } mad_priv->header.mapping = sg_list.addr; recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); post = (++recv_queue->count < recv_queue->max_active); list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); spin_unlock_irqrestore(&recv_queue->lock, flags); ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr); if (ret) { spin_lock_irqsave(&recv_queue->lock, flags); list_del(&mad_priv->header.mad_list.list); recv_queue->count--; spin_unlock_irqrestore(&recv_queue->lock, flags); ib_dma_unmap_single(qp_info->port_priv->device, mad_priv->header.mapping, sizeof *mad_priv - sizeof mad_priv->header, DMA_FROM_DEVICE); kmem_cache_free(ib_mad_cache, mad_priv); printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); break; } } while (post); return ret; } /* * Return all the posted receive MADs */ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) { struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; if (!qp_info->qp) return; while (!list_empty(&qp_info->recv_queue.list)) { mad_list = list_entry(qp_info->recv_queue.list.next, struct ib_mad_list_head, list); mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); /* Remove from posted receive MAD list */ list_del(&mad_list->list); ib_dma_unmap_single(qp_info->port_priv->device, recv->header.mapping, sizeof(struct ib_mad_private) - sizeof(struct ib_mad_private_header), DMA_FROM_DEVICE); kmem_cache_free(ib_mad_cache, recv); } qp_info->recv_queue.count = 0; } /* * Start the port */ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) { 
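	/*
	 * Bring each special QP (QP0/QP1) through the standard state machine:
	 *
	 *   RESET -> INIT  (requires a P_Key index and Q_Key)
	 *   INIT  -> RTR
	 *   RTR   -> RTS   (requires the send queue PSN)
	 *
	 * then arm the CQ and pre-post receive buffers on each QP.
	 */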
int ret, i; struct ib_qp_attr *attr; struct ib_qp *qp; u16 pkey_index = 0; attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n"); return -ENOMEM; } ret = ib_find_pkey(port_priv->device, port_priv->port_num, 0xFFFF, &pkey_index); if (ret) pkey_index = 0; for (i = 0; i < IB_MAD_QPS_CORE; i++) { qp = port_priv->qp_info[i].qp; if (!qp) continue; /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition */ attr->qp_state = IB_QPS_INIT; attr->pkey_index = pkey_index; attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); if (ret) { printk(KERN_ERR PFX "Couldn't change QP%d state to " "INIT: %d\n", i, ret); goto out; } attr->qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, attr, IB_QP_STATE); if (ret) { printk(KERN_ERR PFX "Couldn't change QP%d state to " "RTR: %d\n", i, ret); goto out; } attr->qp_state = IB_QPS_RTS; attr->sq_psn = IB_MAD_SEND_Q_PSN; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { printk(KERN_ERR PFX "Couldn't change QP%d state to " "RTS: %d\n", i, ret); goto out; } } ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); if (ret) { printk(KERN_ERR PFX "Failed to request completion " "notification: %d\n", ret); goto out; } for (i = 0; i < IB_MAD_QPS_CORE; i++) { if (!port_priv->qp_info[i].qp) continue; ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX "Couldn't post receive WRs\n"); goto out; } } out: kfree(attr); return ret; } static void qp_event_handler(struct ib_event *event, void *qp_context) { struct ib_mad_qp_info *qp_info = qp_context; /* It's worse than that! He's dead, Jim! */ printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n", event->event, qp_info->qp->qp_num); } static void init_mad_queue(struct ib_mad_qp_info *qp_info, struct ib_mad_queue *mad_queue) { mad_queue->qp_info = qp_info; mad_queue->count = 0; spin_lock_init(&mad_queue->lock); INIT_LIST_HEAD(&mad_queue->list); } static void init_mad_qp(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info) { qp_info->port_priv = port_priv; init_mad_queue(qp_info, &qp_info->send_queue); init_mad_queue(qp_info, &qp_info->recv_queue); INIT_LIST_HEAD(&qp_info->overflow_list); spin_lock_init(&qp_info->snoop_lock); qp_info->snoop_table = NULL; qp_info->snoop_table_size = 0; atomic_set(&qp_info->snoop_count, 0); } static int create_mad_qp(struct ib_mad_qp_info *qp_info, enum ib_qp_type qp_type) { struct ib_qp_init_attr qp_init_attr; int ret; memset(&qp_init_attr, 0, sizeof qp_init_attr); qp_init_attr.send_cq = qp_info->port_priv->cq; qp_init_attr.recv_cq = qp_info->port_priv->cq; qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; qp_init_attr.cap.max_send_wr = mad_sendq_size; qp_init_attr.cap.max_recv_wr = mad_recvq_size; qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG; qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG; qp_init_attr.qp_type = qp_type; qp_init_attr.port_num = qp_info->port_priv->port_num; qp_init_attr.qp_context = qp_info; qp_init_attr.event_handler = qp_event_handler; qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); if (IS_ERR(qp_info->qp)) { printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n", get_spl_qp_index(qp_type)); ret = PTR_ERR(qp_info->qp); goto error; } /* Use minimum queue sizes unless the CQ is resized */ qp_info->send_queue.max_active = mad_sendq_size; qp_info->recv_queue.max_active = mad_recvq_size; return 0; error: return ret; } static 
void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { if (!qp_info->qp) return; ib_destroy_qp(qp_info->qp); kfree(qp_info->snoop_table); } /* * Open the port * Create the QP, PD, MR, and CQ if needed */ static int ib_mad_port_open(struct ib_device *device, int port_num) { int ret, cq_size; struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { printk(KERN_ERR PFX "No memory for ib_mad_port_private\n"); return -ENOMEM; } port_priv->device = device; port_priv->port_num = port_num; spin_lock_init(&port_priv->reg_lock); INIT_LIST_HEAD(&port_priv->agent_list); init_mad_qp(port_priv, &port_priv->qp_info[0]); init_mad_qp(port_priv, &port_priv->qp_info[1]); cq_size = mad_sendq_size + mad_recvq_size; has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; if (has_smi) cq_size *= 2; port_priv->cq = ib_create_cq(port_priv->device, ib_mad_thread_completion_handler, NULL, port_priv, cq_size, 0); if (IS_ERR(port_priv->cq)) { printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); goto error3; } port_priv->pd = ib_alloc_pd(device); if (IS_ERR(port_priv->pd)) { printk(KERN_ERR PFX "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); goto error4; } port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(port_priv->mr)) { printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n"); ret = PTR_ERR(port_priv->mr); goto error5; } if (has_smi) { ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); if (ret) goto error6; } ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); if (ret) goto error7; snprintf(name, sizeof name, "ib_mad%d", port_num); port_priv->wq = create_singlethread_workqueue(name); if (!port_priv->wq) { ret = -ENOMEM; goto error8; } INIT_WORK(&port_priv->work, ib_mad_completion_handler); if (sa_cc_init(&port_priv->sa_cc)) goto error9; spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); ret = ib_mad_port_start(port_priv); if (ret) { printk(KERN_ERR PFX "Couldn't start port\n"); goto error10; } return 0; error10: spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); error9: sa_cc_destroy(&port_priv->sa_cc); error8: destroy_mad_qp(&port_priv->qp_info[1]); error7: destroy_mad_qp(&port_priv->qp_info[0]); error6: ib_dereg_mr(port_priv->mr); error5: ib_dealloc_pd(port_priv->pd); error4: ib_destroy_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error3: kfree(port_priv); return ret; } /* * Close the port * If there are no classes using the port, free the port * resources (CQ, MR, PD, QP) and remove the port's info structure */ static int ib_mad_port_close(struct ib_device *device, int port_num) { struct ib_mad_port_private *port_priv; unsigned long flags; spin_lock_irqsave(&ib_mad_port_list_lock, flags); port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); printk(KERN_ERR PFX "Port %d not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); sa_cc_destroy(&port_priv->sa_cc); 
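	/*
	 * Release the remaining port resources in reverse order of creation:
	 * QPs, MR, PD and CQ, then free any receive MADs still posted.
	 */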
destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); ib_dereg_mr(port_priv->mr); ib_dealloc_pd(port_priv->pd); ib_destroy_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ kfree(port_priv); return 0; } static void ib_mad_init_device(struct ib_device *device) { int start, end, i; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; if (device->node_type == RDMA_NODE_IB_SWITCH) { start = 0; end = 0; } else { start = 1; end = device->phys_port_cnt; } for (i = start; i <= end; i++) { if (ib_mad_port_open(device, i)) { printk(KERN_ERR PFX "Couldn't open %s port %d\n", device->name, i); goto error; } if (ib_agent_port_open(device, i)) { printk(KERN_ERR PFX "Couldn't open %s port %d " "for agents\n", device->name, i); goto error_agent; } } return; error_agent: if (ib_mad_port_close(device, i)) printk(KERN_ERR PFX "Couldn't close %s port %d\n", device->name, i); error: i--; while (i >= start) { if (ib_agent_port_close(device, i)) printk(KERN_ERR PFX "Couldn't close %s port %d " "for agents\n", device->name, i); if (ib_mad_port_close(device, i)) printk(KERN_ERR PFX "Couldn't close %s port %d\n", device->name, i); i--; } } static void ib_mad_remove_device(struct ib_device *device) { int i, num_ports, cur_port; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; if (device->node_type == RDMA_NODE_IB_SWITCH) { num_ports = 1; cur_port = 0; } else { num_ports = device->phys_port_cnt; cur_port = 1; } for (i = 0; i < num_ports; i++, cur_port++) { if (ib_agent_port_close(device, cur_port)) printk(KERN_ERR PFX "Couldn't close %s port %d " "for agents\n", device->name, cur_port); if (ib_mad_port_close(device, cur_port)) printk(KERN_ERR PFX "Couldn't close %s port %d\n", device->name, cur_port); } } static struct ib_client mad_client = { .name = "mad", .add = ib_mad_init_device, .remove = ib_mad_remove_device }; static int __init ib_mad_init_module(void) { int ret; mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); ib_mad_cache = kmem_cache_create("ib_mad", sizeof(struct ib_mad_private), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ib_mad_cache) { printk(KERN_ERR PFX "Couldn't create ib_mad cache\n"); ret = -ENOMEM; goto error1; } INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { printk(KERN_ERR PFX "Couldn't register ib_mad client\n"); ret = -EINVAL; goto error2; } return 0; error2: kmem_cache_destroy(ib_mad_cache); error1: return ret; } static void __exit ib_mad_cleanup_module(void) { ib_unregister_client(&mad_client); kmem_cache_destroy(ib_mad_cache); } module_init(ib_mad_init_module); module_exit(ib_mad_cleanup_module); Index: head/sys/ofed/drivers/infiniband/core/multicast.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/multicast.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/core/multicast.c (revision 300676) @@ -1,914 +1,916 @@ /* * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#define LINUXKPI_PARAM_PREFIX ibcore_ + #include #include #include #include #include #include #include #include #include #include #include #include "sa.h" static int mcast_leave_retries = 3; /*static const struct kernel_param_ops retry_ops = { .set = param_set_int, .get = param_get_int, }; module_param_cb(mcast_leave_retries, &retry_ops, &mcast_leave_retries, 0644); MODULE_PARM_DESC(mcast_leave_retries, "Number of retries for multicast leave " "requests before giving up (default: 3)"); */ static void mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device); static struct ib_client mcast_client = { .name = "ib_multicast", .add = mcast_add_one, .remove = mcast_remove_one }; static struct ib_sa_client sa_client; static struct workqueue_struct *mcast_wq; static union ib_gid mgid0; struct mcast_device; struct mcast_port { struct mcast_device *dev; spinlock_t lock; struct rb_root table; atomic_t refcount; struct completion comp; u8 port_num; }; struct mcast_device { struct ib_device *device; struct ib_event_handler event_handler; int start_port; int end_port; struct mcast_port port[0]; }; enum mcast_state { MCAST_JOINING, MCAST_MEMBER, MCAST_ERROR, }; enum mcast_group_state { MCAST_IDLE, MCAST_BUSY, MCAST_GROUP_ERROR, MCAST_PKEY_EVENT }; enum { MCAST_INVALID_PKEY_INDEX = 0xFFFF }; struct mcast_member; struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; struct mcast_port *port; spinlock_t lock; struct work_struct work; struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; int members[3]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; int query_id; u16 pkey_index; u8 leave_state; int retries; }; struct mcast_member { struct ib_sa_multicast multicast; struct ib_sa_client *client; struct mcast_group *group; struct list_head list; enum mcast_state state; atomic_t refcount; struct completion comp; }; static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static struct mcast_group *mcast_find(struct mcast_port *port, union ib_gid *mgid) { struct rb_node *node = port->table.rb_node; struct mcast_group *group; int ret; while (node) { group = rb_entry(node, struct mcast_group, node); ret = 
memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); if (!ret) return group; if (ret < 0) node = node->rb_left; else node = node->rb_right; } return NULL; } static struct mcast_group *mcast_insert(struct mcast_port *port, struct mcast_group *group, int allow_duplicates) { struct rb_node **link = &port->table.rb_node; struct rb_node *parent = NULL; struct mcast_group *cur_group; int ret; while (*link) { parent = *link; cur_group = rb_entry(parent, struct mcast_group, node); ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, sizeof group->rec.mgid); if (ret < 0) link = &(*link)->rb_left; else if (ret > 0) link = &(*link)->rb_right; else if (allow_duplicates) link = &(*link)->rb_left; else return cur_group; } rb_link_node(&group->node, parent, link); rb_insert_color(&group->node, &port->table); return NULL; } static void deref_port(struct mcast_port *port) { if (atomic_dec_and_test(&port->refcount)) complete(&port->comp); } static void release_group(struct mcast_group *group) { struct mcast_port *port = group->port; unsigned long flags; spin_lock_irqsave(&port->lock, flags); if (atomic_dec_and_test(&group->refcount)) { rb_erase(&group->node, &port->table); spin_unlock_irqrestore(&port->lock, flags); kfree(group); deref_port(port); } else spin_unlock_irqrestore(&port->lock, flags); } static void deref_member(struct mcast_member *member) { if (atomic_dec_and_test(&member->refcount)) complete(&member->comp); } static void queue_join(struct mcast_member *member) { struct mcast_group *group = member->group; unsigned long flags; spin_lock_irqsave(&group->lock, flags); list_add_tail(&member->list, &group->pending_list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } spin_unlock_irqrestore(&group->lock, flags); } /* * A multicast group has three types of members: full member, non member, and * send only member. We need to keep track of the number of members of each * type based on their join state. Adjust the number of members that belong to * the specified join states. */ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; for (i = 0; i < 3; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } /* * If a multicast group has zero members left for a particular join state, but * the group is still a member with the SA, we need to leave that join state. * Determine which join states we still belong to, but that do not have any * active members.
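* get_leave_state() below returns exactly that set of join states.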
*/ static u8 get_leave_state(struct mcast_group *group) { u8 leave_state = 0; int i; for (i = 0; i < 3; i++) if (!group->members[i]) leave_state |= (0x1 << i); return leave_state & group->rec.join_state; } static int check_selector(ib_sa_comp_mask comp_mask, ib_sa_comp_mask selector_mask, ib_sa_comp_mask value_mask, u8 selector, u8 src_value, u8 dst_value) { int err; if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) return 0; switch (selector) { case IB_SA_GT: err = (src_value <= dst_value); break; case IB_SA_LT: err = (src_value >= dst_value); break; case IB_SA_EQ: err = (src_value != dst_value); break; default: err = 0; break; } return err; } static int cmp_rec(struct ib_sa_mcmember_rec *src, struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) { /* MGID must already match */ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID && memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, src->mtu, dst->mtu)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->traffic_class != dst->traffic_class) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, src->rate, dst->rate)) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, dst->packet_life_time_selector, src->packet_life_time, dst->packet_life_time)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && src->flow_label != dst->flow_label) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && src->hop_limit != dst->hop_limit) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope) return -EINVAL; /* join_state checked separately, proxy_join ignored */ return 0; } static int send_join(struct mcast_group *group, struct mcast_member *member) { struct mcast_port *port = group->port; int ret; group->last_join = member; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_MGMT_METHOD_SET, &member->multicast.rec, member->multicast.comp_mask, 3000, GFP_KERNEL, join_handler, group, &group->query); if (ret >= 0) { group->query_id = ret; ret = 0; } return ret; } static int send_leave(struct mcast_group *group, u8 leave_state) { struct mcast_port *port = group->port; struct ib_sa_mcmember_rec rec; int ret; rec = group->rec; rec.join_state = leave_state; group->leave_state = leave_state; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_SA_METHOD_DELETE, &rec, IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE, 3000, GFP_KERNEL, leave_handler, group, &group->query); if (ret >= 0) { group->query_id = ret; ret = 0; } return ret; } static void join_group(struct mcast_group *group, struct mcast_member *member, u8 join_state) { member->state = MCAST_MEMBER; adjust_membership(group, join_state, 1); group->rec.join_state |= join_state; member->multicast.rec = group->rec; member->multicast.rec.join_state = join_state; list_move(&member->list, &group->active_list); } static int fail_join(struct 
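/*
 * Worked example for adjust_membership()/get_leave_state() above, using a
 * hypothetical group whose only two members join with join_state 0x1 and
 * 0x4 (bit 0 = full member, bit 1 = non member, bit 2 = send-only member,
 * mirrored one-to-one onto members[0..2]):
 *
 *	adjust_membership(group, 0x1, 1);	members[0] becomes 1
 *	adjust_membership(group, 0x4, 1);	members[2] becomes 1
 *
 * Once both members leave (inc == -1) all counters drop to zero, and
 * get_leave_state() returns 0x7 masked by rec.join_state (0x5 here),
 * i.e. exactly the join states that must now be released at the SA.
 */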
mcast_group *group, struct mcast_member *member, int status) { spin_lock_irq(&group->lock); list_del_init(&member->list); spin_unlock_irq(&group->lock); return member->multicast.callback(status, &member->multicast); } static void process_group_error(struct mcast_group *group) { struct mcast_member *member; int ret = 0; u16 pkey_index; if (group->state == MCAST_PKEY_EVENT) ret = ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(group->rec.pkey), &pkey_index); spin_lock_irq(&group->lock); if (group->state == MCAST_PKEY_EVENT && !ret && group->pkey_index == pkey_index) goto out; while (!list_empty(&group->active_list)) { member = list_entry(group->active_list.next, struct mcast_member, list); atomic_inc(&member->refcount); list_del_init(&member->list); adjust_membership(group, member->multicast.rec.join_state, -1); member->state = MCAST_ERROR; spin_unlock_irq(&group->lock); ret = member->multicast.callback(-ENETRESET, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } group->rec.join_state = 0; out: group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); } static void mcast_work_handler(struct work_struct *work) { struct mcast_group *group; struct mcast_member *member; struct ib_sa_multicast *multicast; int status, ret; u8 join_state; group = container_of(work, typeof(*group), work); retest: spin_lock_irq(&group->lock); while (!list_empty(&group->pending_list) || (group->state != MCAST_BUSY)) { if (group->state != MCAST_BUSY) { spin_unlock_irq(&group->lock); process_group_error(group); goto retest; } member = list_entry(group->pending_list.next, struct mcast_member, list); multicast = &member->multicast; join_state = multicast->rec.join_state; atomic_inc(&member->refcount); if (join_state == (group->rec.join_state & join_state)) { status = cmp_rec(&group->rec, &multicast->rec, multicast->comp_mask); if (!status) join_group(group, member, join_state); else list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = multicast->callback(status, multicast); } else { spin_unlock_irq(&group->lock); status = send_join(group, member); if (!status) { deref_member(member); return; } ret = fail_join(group, member, status); } deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } join_state = get_leave_state(group); if (join_state) { group->rec.join_state &= ~join_state; spin_unlock_irq(&group->lock); if (send_leave(group, join_state)) goto retest; } else { group->state = MCAST_IDLE; spin_unlock_irq(&group->lock); release_group(group); } } /* * Fail a join request if it is still active - at the head of the pending queue. 
*/ static void process_join_error(struct mcast_group *group, int status) { struct mcast_member *member; int ret; spin_lock_irq(&group->lock); member = list_entry(group->pending_list.next, struct mcast_member, list); if (group->last_join == member) { atomic_inc(&member->refcount); list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = member->multicast.callback(status, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); } else spin_unlock_irq(&group->lock); } static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; u16 pkey_index = MCAST_INVALID_PKEY_INDEX; if (status) process_join_error(group, status); else { ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index); spin_lock_irq(&group->port->lock); group->rec = *rec; if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { rb_erase(&group->node, &group->port->table); mcast_insert(group->port, group, 1); } spin_unlock_irq(&group->port->lock); } mcast_work_handler(&group->work); } static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; if (status && group->retries > 0 && !send_leave(group, group->leave_state)) group->retries--; else { if (status && group->retries <= 0) printk(KERN_WARNING "reached max retry count. " "status=%d. Giving up\n", status); mcast_work_handler(&group->work); } } static struct mcast_group *acquire_group(struct mcast_port *port, union ib_gid *mgid, gfp_t gfp_mask) { struct mcast_group *group, *cur_group; unsigned long flags; int is_mgid0; is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); if (!is_mgid0) { spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) goto found; spin_unlock_irqrestore(&port->lock, flags); } group = kzalloc(sizeof *group, gfp_mask); if (!group) return NULL; group->retries = mcast_leave_retries; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; INIT_LIST_HEAD(&group->pending_list); INIT_LIST_HEAD(&group->active_list); INIT_WORK(&group->work, mcast_work_handler); spin_lock_init(&group->lock); spin_lock_irqsave(&port->lock, flags); cur_group = mcast_insert(port, group, is_mgid0); if (cur_group) { kfree(group); group = cur_group; } else atomic_inc(&port->refcount); found: atomic_inc(&group->refcount); spin_unlock_irqrestore(&port->lock, flags); return group; } /* * We serialize all join requests to a single group to make our lives much * easier. Otherwise, two users could try to join the same group * simultaneously, with different configurations, one could leave while the * join is in progress, etc., which makes locking around error recovery * difficult. 
*/ struct ib_sa_multicast * ib_sa_join_multicast(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, struct ib_sa_multicast *multicast), void *context) { struct mcast_device *dev; struct mcast_member *member; struct ib_sa_multicast *multicast; int ret; dev = ib_get_client_data(device, &mcast_client); if (!dev) return ERR_PTR(-ENODEV); member = kmalloc(sizeof *member, gfp_mask); if (!member) return ERR_PTR(-ENOMEM); ib_sa_client_get(client); member->client = client; member->multicast.rec = *rec; member->multicast.comp_mask = comp_mask; member->multicast.callback = callback; member->multicast.context = context; init_completion(&member->comp); atomic_set(&member->refcount, 1); member->state = MCAST_JOINING; member->group = acquire_group(&dev->port[port_num - dev->start_port], &rec->mgid, gfp_mask); if (!member->group) { ret = -ENOMEM; goto err; } /* * The user will get the multicast structure in their callback. They * could then free the multicast structure before we can return from * this routine. So we save the pointer to return before queuing * any callback. */ multicast = &member->multicast; queue_join(member); return multicast; err: ib_sa_client_put(client); kfree(member); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_sa_join_multicast); void ib_sa_free_multicast(struct ib_sa_multicast *multicast) { struct mcast_member *member; struct mcast_group *group; member = container_of(multicast, struct mcast_member, multicast); group = member->group; spin_lock_irq(&group->lock); if (member->state == MCAST_MEMBER) adjust_membership(group, multicast->rec.join_state, -1); list_del_init(&member->list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); /* Continue to hold reference on group until callback */ queue_work(mcast_wq, &group->work); } else { spin_unlock_irq(&group->lock); release_group(group); } deref_member(member); wait_for_completion(&member->comp); ib_sa_client_put(member->client); kfree(member); } EXPORT_SYMBOL(ib_sa_free_multicast); int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) { struct mcast_device *dev; struct mcast_port *port; struct mcast_group *group; unsigned long flags; int ret = 0; dev = ib_get_client_data(device, &mcast_client); if (!dev) return -ENODEV; port = &dev->port[port_num - dev->start_port]; spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) *rec = group->rec; else ret = -EADDRNOTAVAIL; spin_unlock_irqrestore(&port->lock, flags); return ret; } EXPORT_SYMBOL(ib_sa_get_mcmember_rec); int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; u8 p; ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index); if (ret) return ret; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->mlid); ah_attr->sl = rec->sl; ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->mgid; ah_attr->grh.sgid_index = (u8) gid_index; ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; return 0; } EXPORT_SYMBOL(ib_init_ah_from_mcmember); static void mcast_groups_event(struct mcast_port *port, enum mcast_group_state state) { struct mcast_group *group; struct rb_node 
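/*
 * Minimal usage sketch for the two entry points exported above.  This is
 * hypothetical caller code; my_join_done, my_sa_client and my_ctx are
 * invented names:
 *
 *	static int my_join_done(int status, struct ib_sa_multicast *mc)
 *	{
 *		if (status)
 *			return status;	a nonzero return tells the core to
 *					call ib_sa_free_multicast() for us
 *		... mc->rec now holds the record completed by the SA ...
 *		return 0;
 *	}
 *
 *	mc = ib_sa_join_multicast(&my_sa_client, device, port_num, &rec,
 *				  comp_mask, GFP_KERNEL, my_join_done,
 *				  my_ctx);
 *	if (IS_ERR(mc))
 *		return PTR_ERR(mc);
 *	...
 *	ib_sa_free_multicast(mc);	leave the group when done
 *
 * As the comment in ib_sa_join_multicast() warns, the callback may fire
 * before the join call itself returns, so the caller must not touch the
 * handle from this thread if its callback already failed the join.
 */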
*node; unsigned long flags; spin_lock_irqsave(&port->lock, flags); for (node = rb_first(&port->table); node; node = rb_next(node)) { group = rb_entry(node, struct mcast_group, node); spin_lock(&group->lock); if (group->state == MCAST_IDLE) { atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } if (group->state != MCAST_GROUP_ERROR) group->state = state; spin_unlock(&group->lock); } spin_unlock_irqrestore(&port->lock, flags); } static void mcast_event_handler(struct ib_event_handler *handler, struct ib_event *event) { struct mcast_device *dev; int index; dev = container_of(handler, struct mcast_device, event_handler); if (rdma_port_get_link_layer(dev->device, event->element.port_num) != IB_LINK_LAYER_INFINIBAND) return; index = event->element.port_num - dev->start_port; switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; case IB_EVENT_PKEY_CHANGE: mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); break; default: break; } } static void mcast_add_one(struct ib_device *device) { struct mcast_device *dev; struct mcast_port *port; int i; int count = 0; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, GFP_KERNEL); if (!dev) return; if (device->node_type == RDMA_NODE_IB_SWITCH) dev->start_port = dev->end_port = 0; else { dev->start_port = 1; dev->end_port = device->phys_port_cnt; } for (i = 0; i <= dev->end_port - dev->start_port; i++) { if (rdma_port_get_link_layer(device, dev->start_port + i) != IB_LINK_LAYER_INFINIBAND) continue; port = &dev->port[i]; port->dev = dev; port->port_num = dev->start_port + i; spin_lock_init(&port->lock); port->table = RB_ROOT; init_completion(&port->comp); atomic_set(&port->refcount, 1); ++count; } if (!count) { kfree(dev); return; } dev->device = device; ib_set_client_data(device, &mcast_client, dev); INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); ib_register_event_handler(&dev->event_handler); } static void mcast_remove_one(struct ib_device *device) { struct mcast_device *dev; struct mcast_port *port; int i; dev = ib_get_client_data(device, &mcast_client); if (!dev) return; ib_unregister_event_handler(&dev->event_handler); flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { if (rdma_port_get_link_layer(device, dev->start_port + i) == IB_LINK_LAYER_INFINIBAND) { port = &dev->port[i]; deref_port(port); wait_for_completion(&port->comp); } } kfree(dev); } int mcast_init(void) { int ret; mcast_wq = create_singlethread_workqueue("ib_mcast"); if (!mcast_wq) return -ENOMEM; ib_sa_register_client(&sa_client); ret = ib_register_client(&mcast_client); if (ret) goto err; return 0; err: ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); return ret; } void mcast_cleanup(void) { ib_unregister_client(&mcast_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); } Index: head/sys/ofed/drivers/infiniband/core/umem.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/umem.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/core/umem.c (revision 300676) @@ -1,442 +1,444 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#define LINUXKPI_PARAM_PREFIX ibcore_ + #include #include #include #include #include #include #include #include #include #include #include "uverbs.h" #define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *)) static int allow_weak_ordering; module_param_named(weak_ordering, allow_weak_ordering, int, 0444); MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory"); static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem, struct ib_umem *umem, unsigned long addr, int dmasync, int invalidation_supported) { int ret; const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; struct invalidation_ctx *invalidation_ctx = NULL; umem->ib_peer_mem = ib_peer_mem; if (invalidation_supported) { invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL); if (!invalidation_ctx) { ret = -ENOMEM; goto out; } umem->invalidation_ctx = invalidation_ctx; invalidation_ctx->umem = umem; mutex_lock(&ib_peer_mem->lock); invalidation_ctx->context_ticket = ib_peer_insert_context(ib_peer_mem, invalidation_ctx); /* unlock before calling get pages to prevent a dead-lock from the callback */ mutex_unlock(&ib_peer_mem->lock); } ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1, &umem->sg_head, umem->peer_mem_client_context, invalidation_ctx ? 
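/*
 * Ticket handshake around this get_pages() call: the invalidation
 * context is inserted into the peer client's tree under
 * ib_peer_mem->lock, the lock is dropped, and only then is
 * peer_mem->get_pages() invoked with the ticket, because the peer may
 * deliver its invalidation callback synchronously from inside
 * get_pages().  Re-taking the lock afterwards lets peer_umem_get()
 * observe invalidation_ctx->peer_invalidated before the umem is
 * published to the caller.
 */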
(void *)invalidation_ctx->context_ticket : NULL); if (invalidation_ctx) { /* take the lock back and check that the pages were not invalidated in the meantime */ mutex_lock(&ib_peer_mem->lock); if (invalidation_ctx->peer_invalidated) { printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n"); ret = -EINVAL; } } if (ret) goto out; umem->page_size = peer_mem->get_page_size (umem->peer_mem_client_context); if (umem->page_size <= 0) goto put_pages; umem->offset = addr & ((unsigned long)umem->page_size - 1); ret = peer_mem->dma_map(&umem->sg_head, umem->peer_mem_client_context, umem->context->device->dma_device, dmasync, &umem->nmap); if (ret) goto put_pages; ib_peer_mem->stats.num_reg_pages += umem->nmap * (umem->page_size >> PAGE_SHIFT); ib_peer_mem->stats.num_alloc_mrs += 1; return umem; put_pages: peer_mem->put_pages(umem->peer_mem_client_context, &umem->sg_head); out: if (invalidation_ctx) { ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); mutex_unlock(&umem->ib_peer_mem->lock); kfree(invalidation_ctx); } ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, umem->peer_mem_srcu_key); kfree(umem); return ERR_PTR(ret); } static void peer_umem_release(struct ib_umem *umem) { struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem; const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; if (invalidation_ctx) { int peer_callback; int inflight_invalidation; /* If we are not under a peer callback we must take the lock before removing * the core ticket from the tree and releasing its umem; this lets any * in-flight callbacks end safely. If we are under a peer callback, or under * the error flow of reg_mr where the context was not activated yet, the lock * was already taken. */ if (invalidation_ctx->func && !invalidation_ctx->peer_callback) mutex_lock(&ib_peer_mem->lock); ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); /* Make sure to check the inflight flag after taking the lock and removing * from the tree. In addition, from this point on use local variables for * peer_callback and inflight_invalidation: after the complete() the * invalidation_ctx can't be accessed any more as it may be freed by the callback.
*/ peer_callback = invalidation_ctx->peer_callback; inflight_invalidation = invalidation_ctx->inflight_invalidation; if (inflight_invalidation) complete(&invalidation_ctx->comp); /* On peer callback lock is handled externally */ if (!peer_callback) /* unlocking before put_pages */ mutex_unlock(&ib_peer_mem->lock); /* in case under callback context or callback is pending let it free the invalidation context */ if (!peer_callback && !inflight_invalidation) kfree(invalidation_ctx); } peer_mem->dma_unmap(&umem->sg_head, umem->peer_mem_client_context, umem->context->device->dma_device); peer_mem->put_pages(&umem->sg_head, umem->peer_mem_client_context); ib_peer_mem->stats.num_dereg_pages += umem->nmap * (umem->page_size >> PAGE_SHIFT); ib_peer_mem->stats.num_dealloc_mrs += 1; ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, umem->peer_mem_srcu_key); kfree(umem); return; } static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { vm_object_t object; struct scatterlist *sg; struct page *page; int i; object = NULL; if (umem->nmap > 0) ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->nmap, DMA_BIDIRECTIONAL); for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { page = sg_page(sg); if (umem->writable && dirty) { if (object && object != page->object) VM_OBJECT_WUNLOCK(object); if (object != page->object) { object = page->object; VM_OBJECT_WLOCK(object); } vm_page_dirty(page); } } sg_free_table(&umem->sg_head); if (object) VM_OBJECT_WUNLOCK(object); } void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, umem_invalidate_func_t func, void *cookie) { struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; invalidation_ctx->func = func; invalidation_ctx->cookie = cookie; /* from that point any pending invalidations can be called */ mutex_unlock(&umem->ib_peer_mem->lock); return; } EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); /** * ib_umem_get - Pin and DMA map userspace memory. * @context: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned * @dmasync: flush in-flight DMA when the memory region is written */ struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync, int invalidation_supported) { struct ib_umem *umem; struct proc *proc; pmap_t pmap; vm_offset_t end, last, start; vm_size_t npages; int error; int ret; int ents; int i; DEFINE_DMA_ATTRS(attrs); struct scatterlist *sg, *sg_list_start; int need_release = 0; error = priv_check(curthread, PRIV_VM_MLOCK); if (error) return ERR_PTR(-error); last = addr + size; start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */ if (last < addr || end < addr) return ERR_PTR(-EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return ERR_PTR(-ENOMEM); umem = kzalloc(sizeof *umem, GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); proc = curthread->td_proc; PROC_LOCK(proc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); kfree(umem); return ERR_PTR(-ENOMEM); } PROC_UNLOCK(proc); if (npages + vm_cnt.v_wire_count > vm_page_max_wired) { kfree(umem); return ERR_PTR(-EAGAIN); } error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | (umem->writable ? 
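/*
 * Usage sketch for the pin/unpin pair exported below.  This is
 * hypothetical provider code; start, length and access_flags are
 * invented names:
 *
 *	umem = ib_umem_get(pd->uobject->context, start, length,
 *			   access_flags, 0);
 *	if (IS_ERR(umem))
 *		return ERR_CAST(umem);
 *	npages = ib_umem_page_count(umem);
 *	... program the HCA translation tables from umem->sg_head ...
 *	ib_umem_release(umem);		unwires the pages, or hands
 *					peer-client memory back
 *
 * Per the access-flag comment below, any flag other than
 * IB_ACCESS_REMOTE_READ makes the mapping writable; for example
 * IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ pins with write
 * access, while IB_ACCESS_REMOTE_READ alone does not.
 */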
VM_MAP_WIRE_WRITE : 0)); if (error != KERN_SUCCESS) { kfree(umem); return ERR_PTR(-ENOMEM); } umem->context = context; umem->length = size; umem->offset = addr & ~PAGE_MASK; umem->page_size = PAGE_SIZE; umem->start = addr; /* * We ask for writable memory if any access flags other than * "remote read" are set. "Local write" and "remote write" * obviously require write access. "Remote atomic" can do * things like fetch and add, which will modify memory, and * "MW bind" can change permissions by binding a window. */ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); if (invalidation_supported || context->peer_mem_private_data) { struct ib_peer_memory_client *peer_mem_client; peer_mem_client = ib_get_peer_client(context, addr, size, &umem->peer_mem_client_context, &umem->peer_mem_srcu_key); if (peer_mem_client) return peer_umem_get(peer_mem_client, umem, addr, dmasync, invalidation_supported); } umem->hugetlb = 0; pmap = vm_map_pmap(&proc->p_vmspace->vm_map); if (npages == 0) { ret = -EINVAL; goto out; } ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) goto out; need_release = 1; sg_list_start = umem->sg_head.sgl; while (npages) { ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK); umem->npages += ents; for_each_sg(sg_list_start, sg, ents, i) { vm_paddr_t pa; pa = pmap_extract(pmap, start); if (pa == 0) { ret = -ENOMEM; goto out; } sg_set_page(sg, PHYS_TO_VM_PAGE(pa), PAGE_SIZE, 0); npages--; start += PAGE_SIZE; } /* preparing for next loop */ sg_list_start = sg; } umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, umem->npages, DMA_BIDIRECTIONAL, &attrs); if (umem->nmap != umem->npages) { ret = -ENOMEM; goto out; } out: if (ret < 0) { if (need_release) __ib_umem_release(context->device, umem, 0); kfree(umem); } return ret < 0 ? ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get_ex); struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync) { return ib_umem_get_ex(context, addr, size, access, dmasync, 0); } EXPORT_SYMBOL(ib_umem_get); /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release */ void ib_umem_release(struct ib_umem *umem) { vm_offset_t addr, end, last, start; vm_size_t size; int error; if (umem->ib_peer_mem) { peer_umem_release(umem); return; } __ib_umem_release(umem->context->device, umem, 1); if (umem->context->closing) { kfree(umem); return; } error = priv_check(curthread, PRIV_VM_MUNLOCK); if (error) return; addr = umem->start; size = umem->length; last = addr + size; start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */ vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); kfree(umem); } EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { int shift; int i; int n; struct scatterlist *sg; shift = ilog2(umem->page_size); n = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) n += sg_dma_len(sg) >> shift; return n; } EXPORT_SYMBOL(ib_umem_page_count); Index: head/sys/ofed/drivers/infiniband/core/uverbs_cmd.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/uverbs_cmd.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/core/uverbs_cmd.c (revision 300676) @@ -1,3912 +1,3914 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. 
* Copyright (c) 2005 PathScale, Inc. All rights reserved. * Copyright (c) 2006 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#define LINUXKPI_PARAM_PREFIX ibcore_ + #include #include #include #include #include #include #include #include #include #include #include "uverbs.h" static int disable_raw_qp_enforcement; module_param_named(disable_raw_qp_enforcement, disable_raw_qp_enforcement, int, 0444); MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable RAW QP enforcement for " "being opened by root (default: 0)"); struct uverbs_lock_class { struct lock_class_key key; char name[16]; }; static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" }; static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" }; static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" }; static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" }; static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; static struct uverbs_lock_class dct_lock_class = { .name = "DCT-uobj" }; static int uverbs_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } static struct ib_udata_ops uverbs_copy = { .copy_from = uverbs_copy_from_udata, .copy_to = uverbs_copy_to_udata }; #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ do { \ (udata)->ops = &uverbs_copy; \ (udata)->inbuf = (void __user *) (ibuf); \ (udata)->outbuf = (void __user *) (obuf); \ (udata)->inlen = (ilen); \ (udata)->outlen = (olen); \ } while (0) enum uverbs_cmd_type { IB_USER_VERBS_CMD_BASIC, IB_USER_VERBS_CMD_EXTENDED }; /* * The ib_uobject locking scheme is as follows: * * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it * needs to be held during all idr operations. When an object is * looked up, a reference must be taken on the object's kref before * dropping this lock. * * - Each object also has an rwsem. 
This rwsem must be held for * reading while an operation that uses the object is performed. * For example, while registering an MR, the associated PD's * uobject.mutex must be held for reading. The rwsem must be held * for writing while initializing or destroying an object. * * - In addition, each object has a "live" flag. If this flag is not * set, then lookups of the object will fail even if it is found in * the idr. This handles a reader that blocks and does not acquire * the rwsem until after the object is destroyed. The destroy * operation will set the live flag to 0 and then drop the rwsem; * this will allow the reader to acquire the rwsem, see that the * live flag is 0, and then drop the rwsem and its reference to * object. The underlying storage will not be freed until the last * reference to the object is dropped. */ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, struct ib_ucontext *context, struct uverbs_lock_class *c) { uobj->user_handle = user_handle; uobj->context = context; kref_init(&uobj->ref); init_rwsem(&uobj->mutex); lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name); uobj->live = 0; } static void release_uobj(struct kref *kref) { kfree(container_of(kref, struct ib_uobject, ref)); } static void put_uobj(struct ib_uobject *uobj) { kref_put(&uobj->ref, release_uobj); } static void put_uobj_read(struct ib_uobject *uobj) { up_read(&uobj->mutex); put_uobj(uobj); } static void put_uobj_write(struct ib_uobject *uobj) { up_write(&uobj->mutex); put_uobj(uobj); } static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj) { int ret; retry: if (!idr_pre_get(idr, GFP_KERNEL)) return -ENOMEM; spin_lock(&ib_uverbs_idr_lock); ret = idr_get_new(idr, uobj, &uobj->id); spin_unlock(&ib_uverbs_idr_lock); if (ret == -EAGAIN) goto retry; return ret; } void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj) { spin_lock(&ib_uverbs_idr_lock); idr_remove(idr, uobj->id); spin_unlock(&ib_uverbs_idr_lock); } static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, struct ib_ucontext *context) { struct ib_uobject *uobj; spin_lock(&ib_uverbs_idr_lock); uobj = idr_find(idr, id); if (uobj) { if (uobj->context == context) kref_get(&uobj->ref); else uobj = NULL; } spin_unlock(&ib_uverbs_idr_lock); return uobj; } static struct ib_uobject *idr_read_uobj(struct idr *idr, int id, struct ib_ucontext *context, int nested) { struct ib_uobject *uobj; uobj = __idr_get_uobj(idr, id, context); if (!uobj) return NULL; if (nested) down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING); else down_read(&uobj->mutex); if (!uobj->live) { put_uobj_read(uobj); return NULL; } return uobj; } static struct ib_uobject *idr_write_uobj(struct idr *idr, int id, struct ib_ucontext *context) { struct ib_uobject *uobj; uobj = __idr_get_uobj(idr, id, context); if (!uobj) return NULL; down_write(&uobj->mutex); if (!uobj->live) { put_uobj_write(uobj); return NULL; } return uobj; } static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context, int nested) { struct ib_uobject *uobj; uobj = idr_read_uobj(idr, id, context, nested); return uobj ? 
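/*
 * Typical caller pattern for the idr_read_* helpers defined below,
 * mirroring the locking scheme described in the comment above
 * (hypothetical handler fragment):
 *
 *	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
 *	if (!pd)
 *		return -EINVAL;	absent, wrong context, or not live
 *	... the verb runs with the uobject rwsem held for reading ...
 *	put_pd_read(pd);	up_read(), then kref_put()
 *
 * Destroy paths use idr_write_uobj()/put_uobj_write() instead, which
 * take the rwsem for writing so they exclude all readers and can clear
 * uobj->live before dropping the final reference.
 */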
uobj->object : NULL; } static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0); } static void put_pd_read(struct ib_pd *pd) { put_uobj_read(pd->uobject); } static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested) { return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested); } static void put_cq_read(struct ib_cq *cq) { put_uobj_read(cq->uobject); } static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0); } static void put_ah_read(struct ib_ah *ah) { put_uobj_read(ah->uobject); } static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); } static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) { struct ib_uobject *uobj; uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context); return uobj ? uobj->object : NULL; } static void put_qp_read(struct ib_qp *qp) { put_uobj_read(qp->uobject); } static void put_qp_write(struct ib_qp *qp) { put_uobj_write(qp->uobject); } static struct ib_dct *idr_read_dct(int dct_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_dct_idr, dct_handle, context, 0); } static void put_dct_read(struct ib_dct *dct) { put_uobj_read(dct->uobject); } static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0); } static void put_srq_read(struct ib_srq *srq) { put_uobj_read(srq->uobject); } static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context, struct ib_uobject **uobj) { *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0); return *uobj ? 
(*uobj)->object : NULL; } static void put_xrcd_read(struct ib_uobject *uobj) { put_uobj_read(uobj); } ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; struct ib_device *ibdev = file->device->ib_dev; struct ib_ucontext *ucontext; struct file *filp; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; mutex_lock(&file->mutex); if (file->ucontext) { ret = -EINVAL; goto err; } INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); ucontext = ibdev->alloc_ucontext(ibdev, &udata); if (IS_ERR(ucontext)) { ret = PTR_ERR(ucontext); goto err; } ucontext->device = ibdev; INIT_LIST_HEAD(&ucontext->pd_list); INIT_LIST_HEAD(&ucontext->mr_list); INIT_LIST_HEAD(&ucontext->mw_list); INIT_LIST_HEAD(&ucontext->cq_list); INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); INIT_LIST_HEAD(&ucontext->dct_list); ucontext->closing = 0; ucontext->peer_mem_private_data = NULL; ucontext->peer_mem_name = NULL; resp.num_comp_vectors = file->device->num_comp_vectors; ret = get_unused_fd(); if (ret < 0) goto err_free; resp.async_fd = ret; filp = ib_uverbs_alloc_event_file(file, 1); if (IS_ERR(filp)) { ret = PTR_ERR(filp); goto err_fd; } file->async_file = filp->private_data; INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev, ib_uverbs_event_handler); ret = ib_register_event_handler(&file->event_handler); if (ret) goto err_file; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_file; } kref_get(&file->async_file->ref); kref_get(&file->ref); file->ucontext = ucontext; fd_install(resp.async_fd, filp); mutex_unlock(&file->mutex); return in_len; err_file: fput(filp); err_fd: put_unused_fd(resp.async_fd); err_free: ibdev->dealloc_ucontext(ucontext); err: mutex_unlock(&file->mutex); return ret; } static void ib_uverbs_query_device_assign( struct ib_uverbs_query_device_resp *resp, struct ib_device_attr *attr, struct ib_uverbs_file *file) { memset(resp, 0, sizeof(*resp)); resp->fw_ver = attr->fw_ver; resp->node_guid = file->device->ib_dev->node_guid; resp->sys_image_guid = attr->sys_image_guid; resp->max_mr_size = attr->max_mr_size; resp->page_size_cap = attr->page_size_cap; resp->vendor_id = attr->vendor_id; resp->vendor_part_id = attr->vendor_part_id; resp->hw_ver = attr->hw_ver; resp->max_qp = attr->max_qp; resp->max_qp_wr = attr->max_qp_wr; resp->device_cap_flags = attr->device_cap_flags; resp->max_sge = attr->max_sge; resp->max_sge_rd = attr->max_sge_rd; resp->max_cq = attr->max_cq; resp->max_cqe = attr->max_cqe; resp->max_mr = attr->max_mr; resp->max_pd = attr->max_pd; resp->max_qp_rd_atom = attr->max_qp_rd_atom; resp->max_ee_rd_atom = attr->max_ee_rd_atom; resp->max_res_rd_atom = attr->max_res_rd_atom; resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; resp->atomic_cap = attr->atomic_cap; resp->max_ee = attr->max_ee; resp->max_rdd = attr->max_rdd; resp->max_mw = attr->max_mw; resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; resp->max_mcast_grp = attr->max_mcast_grp; resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; 
resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; resp->max_ah = attr->max_ah; resp->max_fmr = attr->max_fmr; resp->max_map_per_fmr = attr->max_map_per_fmr; resp->max_srq = attr->max_srq; resp->max_srq_wr = attr->max_srq_wr; resp->max_srq_sge = attr->max_srq_sge; resp->max_pkeys = attr->max_pkeys; resp->local_ca_ack_delay = attr->local_ca_ack_delay; resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; } ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; struct ib_device_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; ret = ib_query_device(file->device->ib_dev, &attr); if (ret) return ret; ib_uverbs_query_device_assign(&resp, &attr, file); if (copy_to_user((void __user *)(unsigned long) cmd.response, &resp, sizeof(resp))) return -EFAULT; return in_len; } ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_port cmd; struct ib_uverbs_query_port_resp resp; struct ib_port_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr); if (ret) return ret; memset(&resp, 0, sizeof resp); resp.state = attr.state; resp.max_mtu = attr.max_mtu; resp.active_mtu = attr.active_mtu; resp.gid_tbl_len = attr.gid_tbl_len; resp.port_cap_flags = attr.port_cap_flags; resp.max_msg_sz = attr.max_msg_sz; resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; resp.lid = attr.lid; resp.sm_lid = attr.sm_lid; resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; resp.subnet_timeout = attr.subnet_timeout; resp.init_type_reply = attr.init_type_reply; resp.active_width = attr.active_width; resp.active_speed = attr.active_speed; resp.phys_state = attr.phys_state; resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, cmd.port_num); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_alloc_pd cmd; struct ib_uverbs_alloc_pd_resp resp; struct ib_udata udata; struct ib_uobject *uobj; struct ib_pd *pd; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, 0, file->ucontext, &pd_lock_class); down_write(&uobj->mutex); pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, file->ucontext, &udata); if (IS_ERR(pd)) { ret = PTR_ERR(pd); goto err; } pd->device = file->device->ib_dev; pd->uobject = uobj; atomic_set(&pd->usecnt, 0); uobj->object = pd; ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj); if (ret) goto err_idr; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->pd_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; 
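/*
 * The sequence above is the template every create-style verb in this
 * file follows: init_uobj() plus down_write() to hold the new uobject
 * locked, the driver allocation, idr_add_uobj() to mint the user
 * handle, copy_to_user() of the response, insertion onto the
 * per-context list, and only then uobj->live = 1 and up_write().
 * The labels below unwind those steps in reverse, which is why a
 * failed copy_to_user() first removes the idr entry and only then
 * deallocates the PD.
 */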
err_copy: idr_remove_uobj(&ib_uverbs_pd_idr, uobj); err_idr: ib_dealloc_pd(pd); err: put_uobj_write(uobj); return ret; } ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_pd cmd; struct ib_uobject *uobj; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); if (!uobj) return -EINVAL; ret = ib_dealloc_pd(uobj->object); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_pd_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } struct xrcd_table_entry { struct rb_node node; struct ib_xrcd *xrcd; struct inode *inode; }; static int xrcd_table_insert(struct ib_uverbs_device *dev, struct inode *inode, struct ib_xrcd *xrcd) { struct xrcd_table_entry *entry, *scan; struct rb_node **p = &dev->xrcd_tree.rb_node; struct rb_node *parent = NULL; entry = kmalloc(sizeof *entry, GFP_KERNEL); if (!entry) return -ENOMEM; entry->xrcd = xrcd; entry->inode = inode; while (*p) { parent = *p; scan = rb_entry(parent, struct xrcd_table_entry, node); if (inode < scan->inode) { p = &(*p)->rb_left; } else if (inode > scan->inode) { p = &(*p)->rb_right; } else { kfree(entry); return -EEXIST; } } rb_link_node(&entry->node, parent, p); rb_insert_color(&entry->node, &dev->xrcd_tree); igrab(inode); return 0; } static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev, struct inode *inode) { struct xrcd_table_entry *entry; struct rb_node *p = dev->xrcd_tree.rb_node; while (p) { entry = rb_entry(p, struct xrcd_table_entry, node); if (inode < entry->inode) p = p->rb_left; else if (inode > entry->inode) p = p->rb_right; else return entry; } return NULL; } static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode) { struct xrcd_table_entry *entry; entry = xrcd_table_search(dev, inode); if (!entry) return NULL; return entry->xrcd; } static void xrcd_table_delete(struct ib_uverbs_device *dev, struct inode *inode) { struct xrcd_table_entry *entry; entry = xrcd_table_search(dev, inode); if (entry) { iput(inode); rb_erase(&entry->node, &dev->xrcd_tree); kfree(entry); } } ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_open_xrcd cmd; struct ib_uverbs_open_xrcd_resp resp; struct ib_udata udata; struct ib_uxrcd_object *obj; struct ib_xrcd *xrcd = NULL; struct fd f = {NULL}; struct inode *inode = NULL; int ret = 0; int new_xrcd = 0; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); mutex_lock(&file->device->xrcd_tree_mutex); if (cmd.fd != -1) { /* search for file descriptor */ f = fdget(cmd.fd); if (!f.file) { ret = -EBADF; goto err_tree_mutex_unlock; } inode = f.file->f_dentry->d_inode; xrcd = find_xrcd(file->device, inode); if (!xrcd && !(cmd.oflags & O_CREAT)) { /* no file descriptor. 
Need CREATE flag */ ret = -EAGAIN; goto err_tree_mutex_unlock; } if (xrcd && cmd.oflags & O_EXCL) { ret = -EINVAL; goto err_tree_mutex_unlock; } } obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) { ret = -ENOMEM; goto err_tree_mutex_unlock; } init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class); down_write(&obj->uobject.mutex); if (!xrcd) { xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, file->ucontext, &udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; } xrcd->inode = inode; xrcd->device = file->device->ib_dev; atomic_set(&xrcd->usecnt, 0); mutex_init(&xrcd->tgt_qp_mutex); INIT_LIST_HEAD(&xrcd->tgt_qp_list); new_xrcd = 1; } atomic_set(&obj->refcnt, 0); obj->uobject.object = xrcd; ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); if (ret) goto err_idr; memset(&resp, 0, sizeof resp); resp.xrcd_handle = obj->uobject.id; if (inode) { if (new_xrcd) { /* create new inode/xrcd table entry */ ret = xrcd_table_insert(file->device, inode, xrcd); if (ret) goto err_insert_xrcd; } atomic_inc(&xrcd->usecnt); } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } if (f.file) fdput(f); mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list); mutex_unlock(&file->mutex); obj->uobject.live = 1; up_write(&obj->uobject.mutex); mutex_unlock(&file->device->xrcd_tree_mutex); return in_len; err_copy: if (inode) { if (new_xrcd) xrcd_table_delete(file->device, inode); atomic_dec(&xrcd->usecnt); } err_insert_xrcd: idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); err_idr: ib_dealloc_xrcd(xrcd); err: put_uobj_write(&obj->uobject); err_tree_mutex_unlock: if (f.file) fdput(f); mutex_unlock(&file->device->xrcd_tree_mutex); return ret; } ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_close_xrcd cmd; struct ib_uobject *uobj; struct ib_xrcd *xrcd = NULL; struct inode *inode = NULL; struct ib_uxrcd_object *obj; int live; int ret = 0; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; mutex_lock(&file->device->xrcd_tree_mutex); uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext); if (!uobj) { ret = -EINVAL; goto out; } xrcd = uobj->object; inode = xrcd->inode; obj = container_of(uobj, struct ib_uxrcd_object, uobject); if (atomic_read(&obj->refcnt)) { put_uobj_write(uobj); ret = -EBUSY; goto out; } if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { ret = ib_dealloc_xrcd(uobj->object); if (!ret) uobj->live = 0; } live = uobj->live; if (inode && ret) atomic_inc(&xrcd->usecnt); put_uobj_write(uobj); if (ret) goto out; if (inode && !live) xrcd_table_delete(file->device, inode); idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); ret = in_len; out: mutex_unlock(&file->device->xrcd_tree_mutex); return ret; } void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd) { struct inode *inode; inode = xrcd->inode; if (inode && !atomic_dec_and_test(&xrcd->usecnt)) return; ib_dealloc_xrcd(xrcd); if (inode) xrcd_table_delete(dev, inode); } ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_reg_mr cmd; struct ib_uverbs_reg_mr_resp resp; struct ib_udata udata; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return 
-EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; ret = ib_check_mr_access(cmd.access_flags); if (ret) return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, 0, file->ucontext, &mr_lock_class); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_free; } /* We first get a new "obj id" to be passed later to reg mr for further use as mr_id. */ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); if (ret) goto err_put; mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, cmd.access_flags, &udata, uobj->id); if (IS_ERR(mr)) { ret = PTR_ERR(mr); goto err_remove_uobj; } mr->device = pd->device; mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); atomic_set(&mr->usecnt, 0); uobj->object = mr; memset(&resp, 0, sizeof resp); resp.lkey = mr->lkey; resp.rkey = mr->rkey; resp.mr_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->mr_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: ib_dereg_mr(mr); err_remove_uobj: idr_remove_uobj(&ib_uverbs_mr_idr, uobj); err_put: put_pd_read(pd); err_free: put_uobj_write(uobj); return ret; } ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dereg_mr cmd; struct ib_mr *mr; struct ib_uobject *uobj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext); if (!uobj) return -EINVAL; mr = uobj->object; ret = ib_dereg_mr(mr); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_mr_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_alloc_mw cmd; struct ib_uverbs_alloc_mw_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, 0, file->ucontext, &mw_lock_class); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_free; } mw = pd->device->alloc_mw(pd, cmd.mw_type); if (IS_ERR(mw)) { ret = PTR_ERR(mw); goto err_put; } mw->device = pd->device; mw->pd = pd; mw->uobject = uobj; atomic_inc(&pd->usecnt); uobj->object = mw; ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj); if (ret) goto err_unalloc; memset(&resp, 0, sizeof(resp)); resp.rkey = mw->rkey; resp.mw_handle = uobj->id; if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) { ret = -EFAULT; goto err_copy; } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->mw_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_mw_idr, uobj); err_unalloc: ib_dealloc_mw(mw); err_put: put_pd_read(pd); err_free: put_uobj_write(uobj); return ret; } ssize_t 
ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_mw cmd; struct ib_mw *mw; struct ib_uobject *uobj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext); if (!uobj) return -EINVAL; mw = uobj->object; ret = ib_dealloc_mw(mw); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_comp_channel cmd; struct ib_uverbs_create_comp_channel_resp resp; struct file *filp; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; ret = get_unused_fd(); if (ret < 0) return ret; resp.fd = ret; filp = ib_uverbs_alloc_event_file(file, 0); if (IS_ERR(filp)) { put_unused_fd(resp.fd); return PTR_ERR(filp); } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { put_unused_fd(resp.fd); fput(filp); return -EFAULT; } fd_install(resp.fd, filp); return in_len; } static ssize_t create_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len, void *vcmd, int ex, void __user *response) { struct ib_uverbs_create_cq *cmd; struct ib_uverbs_create_cq_ex *cmd_e; struct ib_uverbs_create_cq_resp resp; struct ib_udata udata; struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file = NULL; struct ib_cq *cq; struct ib_cq_init_attr attr; int cmd_sz; int ret; if (out_len < sizeof resp) return -ENOSPC; cmd = vcmd; cmd_e = vcmd; cmd_sz = ex ? 
sizeof(*cmd_e) : sizeof(*cmd); INIT_UDATA(&udata, buf + cmd_sz, response + sizeof(resp), in_len - sizeof(cmd), out_len - sizeof(resp)); if (cmd->comp_vector >= file->device->num_comp_vectors) return -EINVAL; obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class); down_write(&obj->uobject.mutex); if (cmd->comp_channel >= 0) { ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel); if (!ev_file) { ret = -EINVAL; goto err; } } obj->uverbs_file = file; obj->comp_events_reported = 0; obj->async_events_reported = 0; INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); memset(&attr, 0, sizeof(attr)); attr.cqe = cmd->cqe; attr.comp_vector = cmd->comp_vector; if (ex && (cmd_e->comp_mask & IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS)) attr.flags = cmd_e->create_flags; cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr, file->ucontext, &udata); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; } cq->device = file->device->ib_dev; cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file; atomic_set(&cq->usecnt, 0); obj->uobject.object = cq; ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject); if (ret) goto err_free; memset(&resp, 0, sizeof resp); resp.cq_handle = obj->uobject.id; resp.cqe = cq->cqe; if (copy_to_user(response, &resp, sizeof(resp))) { ret = -EFAULT; goto err_copy; } mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->cq_list); mutex_unlock(&file->mutex); obj->uobject.live = 1; up_write(&obj->uobject.mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject); err_free: ib_destroy_cq(cq); err_file: if (ev_file) ib_uverbs_release_ucq(file, ev_file, obj); err: put_uobj_write(&obj->uobject); return ret; } ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_cq cmd; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; return create_cq(file, buf, in_len, out_len, &cmd, IB_USER_VERBS_CMD_BASIC, (void __user *) (unsigned long) cmd.response); } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_resize_cq cmd; struct ib_uverbs_resize_cq_resp resp; struct ib_udata udata; struct ib_cq *cq; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) return -EINVAL; ret = cq->device->resize_cq(cq, cmd.cqe, &udata); if (ret) goto out; resp.cqe = cq->cqe; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp.cqe)) ret = -EFAULT; out: put_cq_read(cq); return ret ? 
ret : in_len; } static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) { struct ib_uverbs_wc tmp; tmp.wr_id = wc->wr_id; tmp.status = wc->status; tmp.opcode = wc->opcode; tmp.vendor_err = wc->vendor_err; tmp.byte_len = wc->byte_len; tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; tmp.qp_num = wc->qp->qp_num; tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; tmp.slid = wc->slid; tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; tmp.reserved = 0; if (copy_to_user(dest, &tmp, sizeof tmp)) return -EFAULT; return 0; } ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_poll_cq cmd; struct ib_uverbs_poll_cq_resp resp; u8 __user *header_ptr; u8 __user *data_ptr; struct ib_cq *cq; struct ib_wc wc; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) return -EINVAL; /* we copy a struct ib_uverbs_poll_cq_resp to user space */ header_ptr = (void __user *)(unsigned long) cmd.response; data_ptr = header_ptr + sizeof resp; memset(&resp, 0, sizeof resp); while (resp.count < cmd.ne) { ret = ib_poll_cq(cq, 1, &wc); if (ret < 0) goto out_put; if (!ret) break; ret = copy_wc_to_user(data_ptr, &wc); if (ret) goto out_put; data_ptr += sizeof(struct ib_uverbs_wc); ++resp.count; } if (copy_to_user(header_ptr, &resp, sizeof resp)) { ret = -EFAULT; goto out_put; } ret = in_len; out_put: put_cq_read(cq); return ret; } ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_req_notify_cq cmd; struct ib_cq *cq; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) return -EINVAL; ib_req_notify_cq(cq, cmd.solicited_only ? 
IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); put_cq_read(cq); return in_len; } ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_cq cmd; struct ib_uverbs_destroy_cq_resp resp; struct ib_uobject *uobj; struct ib_cq *cq; struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext); if (!uobj) return -EINVAL; cq = uobj->object; ev_file = cq->cq_context; obj = container_of(cq->uobject, struct ib_ucq_object, uobject); ret = ib_destroy_cq(cq); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_cq_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); ib_uverbs_release_ucq(file, ev_file, obj); memset(&resp, 0, sizeof resp); resp.comp_events_reported = obj->comp_events_reported; resp.async_events_reported = obj->async_events_reported; put_uobj(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { void __user *response; struct ib_udata udata; struct ib_uqp_object *obj; struct ib_device *device; struct ib_pd *pd = NULL; struct ib_xrcd *xrcd = NULL; struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_cq *scq = NULL, *rcq = NULL; struct ib_srq *srq = NULL; struct ib_qp *qp; struct ib_qp_init_attr attr; int ret; union { struct ib_uverbs_create_qp basic; } cmd_obj; struct ib_uverbs_create_qp *cmd; size_t cmd_size = 0; union { struct ib_uverbs_create_qp_resp basic; } resp_obj; struct ib_uverbs_create_qp_resp *resp; size_t resp_size = 0; cmd_size = sizeof(cmd_obj.basic); cmd = &cmd_obj.basic; resp_size = sizeof(resp_obj.basic); resp = &resp_obj.basic; if (out_len < resp_size) return -ENOSPC; if (copy_from_user(&cmd_obj, buf, cmd_size)) return -EFAULT; response = (void __user *) (unsigned long) cmd->response; if (!disable_raw_qp_enforcement && cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW)) return -EPERM; INIT_UDATA(&udata, buf + cmd_size, response + resp_size, in_len - cmd_size, out_len - resp_size); obj = kzalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); if (cmd->qp_type == IB_QPT_XRC_TGT) { xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } device = xrcd->device; } else { if (cmd->qp_type == IB_QPT_XRC_INI) { cmd->max_recv_wr = 0; cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { srq = idr_read_srq(cmd->srq_handle, file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { ret = -EINVAL; goto err_put; } } if (cmd->recv_cq_handle != cmd->send_cq_handle) { rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0); if (!rcq) { ret = -EINVAL; goto err_put; } } } scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); rcq = rcq ?: scq; pd = idr_read_pd(cmd->pd_handle, file->ucontext); if (!pd || !scq) { ret = -EINVAL; goto err_put; } device = pd->device; } memset(&attr, 0, sizeof attr); attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; attr.xrcd = xrcd; attr.sq_sig_type = cmd->sq_sig_all ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd->qp_type; attr.create_flags = 0; attr.cap.max_send_wr = cmd->max_send_wr; attr.cap.max_recv_wr = cmd->max_recv_wr; attr.cap.max_send_sge = cmd->max_send_sge; attr.cap.max_recv_sge = cmd->max_recv_sge; attr.cap.max_inline_data = cmd->max_inline_data; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else qp = device->create_qp(pd, &attr, &udata); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } if (cmd->qp_type != IB_QPT_XRC_TGT) { qp->real_qp = qp; qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); } qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp_obj, 0, sizeof(resp_obj)); resp->qpn = qp->qp_num; resp->qp_handle = obj->uevent.uobject.id; resp->max_recv_sge = attr.cap.max_recv_sge; resp->max_send_sge = attr.cap.max_send_sge; resp->max_recv_wr = attr.cap.max_recv_wr; resp->max_send_wr = attr.cap.max_send_wr; resp->max_inline_data = attr.cap.max_inline_data; if (copy_to_user(response, &resp_obj, resp_size)) { ret = -EFAULT; goto err_copy; } if (xrcd) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); } if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); err_put: if (xrcd) put_xrcd_read(xrcd_uobj); if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); put_uobj_write(&obj->uevent.uobject); return ret; } ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_open_qp cmd; struct ib_uverbs_create_qp_resp resp; struct ib_udata udata; struct ib_uqp_object *obj; struct ib_xrcd *xrcd; struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_qp *qp; struct ib_qp_open_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.qp_num = cmd.qpn; attr.qp_type = cmd.qp_type; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); qp = ib_open_qp(xrcd, &attr); if (IS_ERR(qp)) { ret 
= PTR_ERR(qp); goto err_put; } qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp, 0, sizeof resp); resp.qpn = qp->qp_num; resp.qp_handle = obj->uevent.uobject.id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_remove; } obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return in_len; err_remove: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); err_put: put_xrcd_read(xrcd_uobj); put_uobj_write(&obj->uevent.uobject); return ret; } ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_qp cmd; struct ib_uverbs_query_qp_resp resp; struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_init_attr *init_attr; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; attr = kmalloc(sizeof *attr, GFP_KERNEL); init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); if (!attr || !init_attr) { ret = -ENOMEM; goto out; } qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; } ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); put_qp_read(qp); if (ret) goto out; memset(&resp, 0, sizeof resp); resp.qp_state = attr->qp_state; resp.cur_qp_state = attr->cur_qp_state; resp.path_mtu = attr->path_mtu; resp.path_mig_state = attr->path_mig_state; resp.qkey = attr->qkey; resp.rq_psn = attr->rq_psn; resp.sq_psn = attr->sq_psn; resp.dest_qp_num = attr->dest_qp_num; resp.qp_access_flags = attr->qp_access_flags; resp.pkey_index = attr->pkey_index; resp.alt_pkey_index = attr->alt_pkey_index; resp.sq_draining = attr->sq_draining; resp.max_rd_atomic = attr->max_rd_atomic; resp.max_dest_rd_atomic = attr->max_dest_rd_atomic; resp.min_rnr_timer = attr->min_rnr_timer; resp.port_num = attr->port_num; resp.timeout = attr->timeout; resp.retry_cnt = attr->retry_cnt; resp.rnr_retry = attr->rnr_retry; resp.alt_port_num = attr->alt_port_num; resp.alt_timeout = attr->alt_timeout; memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); resp.dest.flow_label = attr->ah_attr.grh.flow_label; resp.dest.sgid_index = attr->ah_attr.grh.sgid_index; resp.dest.hop_limit = attr->ah_attr.grh.hop_limit; resp.dest.traffic_class = attr->ah_attr.grh.traffic_class; resp.dest.dlid = attr->ah_attr.dlid; resp.dest.sl = attr->ah_attr.sl; resp.dest.src_path_bits = attr->ah_attr.src_path_bits; resp.dest.static_rate = attr->ah_attr.static_rate; resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH); resp.dest.port_num = attr->ah_attr.port_num; memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; resp.alt_dest.dlid = attr->alt_ah_attr.dlid; resp.alt_dest.sl = attr->alt_ah_attr.sl; resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate; resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH); resp.alt_dest.port_num = 
attr->alt_ah_attr.port_num; resp.max_send_wr = init_attr->cap.max_send_wr; resp.max_recv_wr = init_attr->cap.max_recv_wr; resp.max_send_sge = init_attr->cap.max_send_sge; resp.max_recv_sge = init_attr->cap.max_recv_sge; resp.max_inline_data = init_attr->cap.max_inline_data; resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: kfree(attr); kfree(init_attr); return ret ? ret : in_len; } /* Remove ignored fields set in the attribute mask */ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) { switch (qp_type) { case IB_QPT_XRC_INI: return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); case IB_QPT_XRC_TGT: return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY); default: return mask; } } static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len, enum uverbs_cmd_type cmd_type) { struct ib_uverbs_modify_qp_ex cmd; struct ib_udata udata; struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_attr_ex *attrx; int ret; void *p; union ib_gid sgid; union ib_gid *dgid; u8 port_num; if (cmd_type == IB_USER_VERBS_CMD_BASIC) { p = &cmd; p += sizeof(cmd.comp_mask); if (copy_from_user(p, buf, sizeof(struct ib_uverbs_modify_qp))) return -EFAULT; } else { if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; } INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); attrx = kzalloc(sizeof(*attrx), GFP_KERNEL); if (!attrx) return -ENOMEM; attr = (struct ib_qp_attr *)attrx; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { kfree(attrx); return -EINVAL; } attr->qp_state = cmd.qp_state; attr->cur_qp_state = cmd.cur_qp_state; attr->path_mtu = cmd.path_mtu; attr->path_mig_state = cmd.path_mig_state; attr->qkey = cmd.qkey; attr->rq_psn = cmd.rq_psn; attr->sq_psn = cmd.sq_psn; attr->dest_qp_num = cmd.dest_qp_num; attr->qp_access_flags = cmd.qp_access_flags; attr->pkey_index = cmd.pkey_index; attr->alt_pkey_index = cmd.alt_pkey_index; attr->en_sqd_async_notify = cmd.en_sqd_async_notify; attr->max_rd_atomic = cmd.max_rd_atomic; attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic; attr->min_rnr_timer = cmd.min_rnr_timer; attr->port_num = cmd.port_num; attr->timeout = cmd.timeout; attr->retry_cnt = cmd.retry_cnt; attr->rnr_retry = cmd.rnr_retry; attr->alt_port_num = cmd.alt_port_num; attr->alt_timeout = cmd.alt_timeout; memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16); attr->ah_attr.grh.flow_label = cmd.dest.flow_label; attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index; attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit; attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class; attr->ah_attr.dlid = cmd.dest.dlid; attr->ah_attr.sl = cmd.dest.sl; attr->ah_attr.src_path_bits = cmd.dest.src_path_bits; attr->ah_attr.static_rate = cmd.dest.static_rate; attr->ah_attr.ah_flags = cmd.dest.is_global ? 
IB_AH_GRH : 0; attr->ah_attr.port_num = cmd.dest.port_num; memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16); attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label; attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index; attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit; attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class; attr->alt_ah_attr.dlid = cmd.alt_dest.dlid; attr->alt_ah_attr.sl = cmd.alt_dest.sl; attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits; attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; port_num = (cmd.attr_mask & IB_QP_PORT) ? cmd.port_num : qp->port_num; if ((cmd.attr_mask & IB_QP_AV) && port_num && (rdma_port_get_link_layer(qp->device, port_num) == IB_LINK_LAYER_ETHERNET)) { ret = ib_query_gid(qp->device, port_num, attr->ah_attr.grh.sgid_index, &sgid); if (ret) goto out; dgid = &attr->ah_attr.grh.dgid; if (rdma_link_local_addr((struct in6_addr *)dgid->raw)) { rdma_get_ll_mac((struct in6_addr *)dgid->raw, attr->ah_attr.dmac); rdma_get_ll_mac((struct in6_addr *)sgid.raw, attr->smac); attr->vlan_id = rdma_get_vlan_id(&sgid); } else { ret = rdma_addr_find_dmac_by_grh(&sgid, dgid, attr->ah_attr.dmac, &attr->vlan_id, -1U); if (ret) goto out; ret = rdma_addr_find_smac_by_sgid(&sgid, attr->smac, NULL, -1U); if (ret) goto out; } cmd.attr_mask |= IB_QP_SMAC; if (attr->vlan_id < 0xFFFF) cmd.attr_mask |= IB_QP_VID; } if (cmd_type == IB_USER_VERBS_CMD_EXTENDED) { if (cmd.comp_mask & IB_UVERBS_QP_ATTR_DCT_KEY) attrx->dct_key = cmd.dct_key; } if (qp->real_qp == qp) { ret = qp->device->modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); if (!ret && (cmd.attr_mask & IB_QP_PORT)) qp->port_num = attr->port_num; } else { ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask)); } if (ret) goto out; ret = in_len; out: put_qp_read(qp); kfree(attrx); return ret; } ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { return __uverbs_modify_qp(file, buf, in_len, out_len, IB_USER_VERBS_CMD_BASIC); } ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp resp; struct ib_uobject *uobj; struct ib_qp *qp; struct ib_uqp_object *obj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; memset(&resp, 0, sizeof resp); uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext); if (!uobj) return -EINVAL; qp = uobj->object; obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); if (!list_empty(&obj->mcast_list)) { put_uobj_write(uobj); return -EBUSY; } ret = ib_destroy_qp(qp); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; if (obj->uxrcd) atomic_dec(&obj->uxrcd->refcnt); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); ib_uverbs_release_uevent(file, &obj->uevent); resp.events_reported = obj->uevent.events_reported; put_uobj(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_send cmd; struct ib_uverbs_post_send_resp resp; struct ib_uverbs_send_wr *user_wr; struct ib_send_wr *wr = 
NULL, *last, *next, *bad_wr; struct ib_qp *qp; int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count + cmd.sge_count * sizeof (struct ib_uverbs_sge)) return -EINVAL; if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr)) return -EINVAL; user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); if (!user_wr) return -ENOMEM; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) goto out; is_ud = qp->qp_type == IB_QPT_UD; sg_ind = 0; last = NULL; for (i = 0; i < cmd.wr_count; ++i) { if (copy_from_user(user_wr, buf + sizeof cmd + i * cmd.wqe_size, cmd.wqe_size)) { ret = -EFAULT; goto out_put; } if (user_wr->num_sge + sg_ind > cmd.sge_count) { ret = -EINVAL; goto out_put; } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), GFP_KERNEL); if (!next) { ret = -ENOMEM; goto out_put; } if (!last) wr = next; else last->next = next; last = next; next->next = NULL; next->wr_id = user_wr->wr_id; next->num_sge = user_wr->num_sge; next->opcode = user_wr->opcode; next->send_flags = user_wr->send_flags; if (is_ud) { next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); if (!next->wr.ud.ah) { ret = -EINVAL; goto out_put; } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; } else { switch (next->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: next->ex.imm_data = (__be32 __force) user_wr->ex.imm_data; case IB_WR_RDMA_WRITE: case IB_WR_RDMA_READ: next->wr.rdma.remote_addr = user_wr->wr.rdma.remote_addr; next->wr.rdma.rkey = user_wr->wr.rdma.rkey; break; case IB_WR_SEND_WITH_IMM: next->ex.imm_data = (__be32 __force) user_wr->ex.imm_data; break; case IB_WR_SEND_WITH_INV: next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey; break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: next->wr.atomic.remote_addr = user_wr->wr.atomic.remote_addr; next->wr.atomic.compare_add = user_wr->wr.atomic.compare_add; next->wr.atomic.swap = user_wr->wr.atomic.swap; next->wr.atomic.rkey = user_wr->wr.atomic.rkey; break; default: break; } } if (next->num_sge) { next->sg_list = (void *) next + ALIGN(sizeof *next, sizeof (struct ib_sge)); if (copy_from_user(next->sg_list, buf + sizeof cmd + cmd.wr_count * cmd.wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; goto out_put; } sg_ind += next->num_sge; } else next->sg_list = NULL; } resp.bad_wr = 0; ret = qp->device->post_send(qp->real_qp, wr, &bad_wr); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out_put: put_qp_read(qp); while (wr) { if (is_ud && wr->wr.ud.ah) put_ah_read(wr->wr.ud.ah); next = wr->next; kfree(wr); wr = next; } out: kfree(user_wr); return ret ? 
ret : in_len; } static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, int in_len, u32 wr_count, u32 sge_count, u32 wqe_size) { struct ib_uverbs_recv_wr *user_wr; struct ib_recv_wr *wr = NULL, *last, *next; int sg_ind; int i; int ret; if (in_len < wqe_size * wr_count + sge_count * sizeof (struct ib_uverbs_sge)) return ERR_PTR(-EINVAL); if (wqe_size < sizeof (struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); user_wr = kmalloc(wqe_size, GFP_KERNEL); if (!user_wr) return ERR_PTR(-ENOMEM); sg_ind = 0; last = NULL; for (i = 0; i < wr_count; ++i) { if (copy_from_user(user_wr, buf + i * wqe_size, wqe_size)) { ret = -EFAULT; goto err; } if (user_wr->num_sge + sg_ind > sge_count) { ret = -EINVAL; goto err; } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), GFP_KERNEL); if (!next) { ret = -ENOMEM; goto err; } if (!last) wr = next; else last->next = next; last = next; next->next = NULL; next->wr_id = user_wr->wr_id; next->num_sge = user_wr->num_sge; if (next->num_sge) { next->sg_list = (void *) next + ALIGN(sizeof *next, sizeof (struct ib_sge)); if (copy_from_user(next->sg_list, buf + wr_count * wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; goto err; } sg_ind += next->num_sge; } else next->sg_list = NULL; } kfree(user_wr); return wr; err: kfree(user_wr); while (wr) { next = wr->next; kfree(wr); wr = next; } return ERR_PTR(ret); } ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_recv cmd; struct ib_uverbs_post_recv_resp resp; struct ib_recv_wr *wr, *next, *bad_wr; struct ib_qp *qp; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, in_len - sizeof cmd, cmd.wr_count, cmd.sge_count, cmd.wqe_size); if (IS_ERR(wr)) return PTR_ERR(wr); qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) goto out; resp.bad_wr = 0; ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); put_qp_read(qp); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: while (wr) { next = wr->next; kfree(wr); wr = next; } return ret ? ret : in_len; } ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_srq_recv cmd; struct ib_uverbs_post_srq_recv_resp resp; struct ib_recv_wr *wr, *next, *bad_wr; struct ib_srq *srq; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, in_len - sizeof cmd, cmd.wr_count, cmd.sge_count, cmd.wqe_size); if (IS_ERR(wr)) return PTR_ERR(wr); srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) goto out; resp.bad_wr = 0; ret = srq->device->post_srq_recv(srq, wr, &bad_wr); put_srq_read(srq); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: while (wr) { next = wr->next; kfree(wr); wr = next; } return ret ? 
ret : in_len; } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_ah cmd; struct ib_uverbs_create_ah_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_ah *ah; struct ib_ah_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; } attr.dlid = cmd.attr.dlid; attr.sl = cmd.attr.sl; attr.src_path_bits = cmd.attr.src_path_bits; attr.static_rate = cmd.attr.static_rate; attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0; attr.port_num = cmd.attr.port_num; attr.grh.flow_label = cmd.attr.grh.flow_label; attr.grh.sgid_index = cmd.attr.grh.sgid_index; attr.grh.hop_limit = cmd.attr.grh.hop_limit; attr.grh.traffic_class = cmd.attr.grh.traffic_class; memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); ah = ib_create_ah(pd, &attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto err_put; } ah->uobject = uobj; uobj->object = ah; ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj); if (ret) goto err_destroy; resp.ah_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->ah_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_ah_idr, uobj); err_destroy: ib_destroy_ah(ah); err_put: put_pd_read(pd); err: put_uobj_write(uobj); return ret; } ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_ah cmd; struct ib_ah *ah; struct ib_uobject *uobj; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext); if (!uobj) return -EINVAL; ah = uobj->object; ret = ib_destroy_ah(ah); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_ah_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_attach_mcast cmd; struct ib_qp *qp; struct ib_uqp_object *obj; struct ib_uverbs_mcast_entry *mcast; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; qp = idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { ret = 0; goto out_put; } mcast = kmalloc(sizeof *mcast, GFP_KERNEL); if (!mcast) { ret = -ENOMEM; goto out_put; } mcast->lid = cmd.mlid; memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw); ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid); if (!ret) list_add_tail(&mcast->list, &obj->mcast_list); else kfree(mcast); out_put: put_qp_write(qp); return ret ? 
ret : in_len; } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_detach_mcast cmd; struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; qp = idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid); if (ret) goto out_put; obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { list_del(&mcast->list); kfree(mcast); break; } out_put: put_qp_write(qp); return ret ? ret : in_len; } static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_uverbs_create_xsrq *cmd, struct ib_udata *udata) { struct ib_uverbs_create_srq_resp resp; struct ib_usrq_object *obj; struct ib_pd *pd; struct ib_srq *srq; struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_srq_init_attr attr; int ret; obj = kmalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); down_write(&obj->uevent.uobject.mutex); if (cmd->srq_type == IB_SRQT_XRC) { attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); if (!attr.ext.xrc.xrcd) { ret = -EINVAL; goto err; } obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); if (!attr.ext.xrc.cq) { ret = -EINVAL; goto err_put_xrcd; } } pd = idr_read_pd(cmd->pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_put_cq; } attr.event_handler = ib_uverbs_srq_event_handler; attr.srq_context = file; attr.srq_type = cmd->srq_type; attr.attr.max_wr = cmd->max_wr; attr.attr.max_sge = cmd->max_sge; attr.attr.srq_limit = cmd->srq_limit; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); srq = pd->device->create_srq(pd, &attr, udata); if (IS_ERR(srq)) { ret = PTR_ERR(srq); goto err_put; } srq->device = pd->device; srq->pd = pd; srq->srq_type = cmd->srq_type; srq->uobject = &obj->uevent.uobject; srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; if (cmd->srq_type == IB_SRQT_XRC) { srq->ext.xrc.cq = attr.ext.xrc.cq; srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; atomic_inc(&attr.ext.xrc.cq->usecnt); atomic_inc(&attr.ext.xrc.xrcd->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); obj->uevent.uobject.object = srq; ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp, 0, sizeof resp); resp.srq_handle = obj->uevent.uobject.id; resp.max_wr = attr.attr.max_wr; resp.max_sge = attr.attr.max_sge; if (cmd->srq_type == IB_SRQT_XRC) resp.srqn = srq->ext.xrc.srq_num; if (copy_to_user((void __user *) (unsigned long) cmd->response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } if (cmd->srq_type == IB_SRQT_XRC) { put_uobj_read(xrcd_uobj); put_cq_read(attr.ext.xrc.cq); } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return 0; err_copy: idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); err_destroy: ib_destroy_srq(srq); err_put: put_pd_read(pd); err_put_cq: if (cmd->srq_type == IB_SRQT_XRC) 
put_cq_read(attr.ext.xrc.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { atomic_dec(&obj->uxrcd->refcnt); put_uobj_read(xrcd_uobj); } err: put_uobj_write(&obj->uevent.uobject); return ret; } ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_srq cmd; struct ib_uverbs_create_xsrq xcmd; struct ib_uverbs_create_srq_resp resp; struct ib_udata udata; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; xcmd.response = cmd.response; xcmd.user_handle = cmd.user_handle; xcmd.srq_type = IB_SRQT_BASIC; xcmd.pd_handle = cmd.pd_handle; xcmd.max_wr = cmd.max_wr; xcmd.max_sge = cmd.max_sge; xcmd.srq_limit = cmd.srq_limit; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); ret = __uverbs_create_xsrq(file, &xcmd, &udata); if (ret) return ret; return in_len; } ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_xsrq cmd; struct ib_uverbs_create_srq_resp resp; struct ib_udata udata; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); ret = __uverbs_create_xsrq(file, &cmd, &udata); if (ret) return ret; return in_len; } ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_modify_srq cmd; struct ib_udata udata; struct ib_srq *srq; struct ib_srq_attr attr; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; attr.max_wr = cmd.max_wr; attr.srq_limit = cmd.srq_limit; ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); put_srq_read(srq); return ret ? 
ret : in_len; } ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_srq cmd; struct ib_uverbs_query_srq_resp resp; struct ib_srq_attr attr; struct ib_srq *srq; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; ret = ib_query_srq(srq, &attr); put_srq_read(srq); if (ret) return ret; memset(&resp, 0, sizeof resp); resp.max_wr = attr.max_wr; resp.max_sge = attr.max_sge; resp.srq_limit = attr.srq_limit; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_srq cmd; struct ib_uverbs_destroy_srq_resp resp; struct ib_uobject *uobj; struct ib_srq *srq; struct ib_uevent_object *obj; int ret = -EINVAL; struct ib_usrq_object *us; enum ib_srq_type srq_type; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); if (!uobj) return -EINVAL; srq = uobj->object; obj = container_of(uobj, struct ib_uevent_object, uobject); srq_type = srq->srq_type; ret = ib_destroy_srq(srq); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; if (srq_type == IB_SRQT_XRC) { us = container_of(obj, struct ib_usrq_object, uevent); atomic_dec(&us->uxrcd->refcnt); } idr_remove_uobj(&ib_uverbs_srq_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); ib_uverbs_release_uevent(file, obj); memset(&resp, 0, sizeof resp); resp.events_reported = obj->events_reported; put_uobj(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; return ret ? 
ret : in_len; } ssize_t ib_uverbs_exp_create_dct(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { int in_len = ucore->inlen + uhw->inlen; int out_len = ucore->outlen + uhw->outlen; struct ib_uverbs_create_dct cmd; struct ib_uverbs_create_dct_resp resp; struct ib_udata udata; struct ib_udct_object *obj; struct ib_dct *dct; int ret; struct ib_dct_init_attr attr; struct ib_pd *pd = NULL; struct ib_cq *cq = NULL; struct ib_srq *srq = NULL; if (out_len < sizeof(resp)) return -ENOSPC; ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); if (ret) return ret; obj = kmalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &dct_lock_class); down_write(&obj->uobject.mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_pd; } cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) { ret = -EINVAL; goto err_put; } srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) { ret = -EINVAL; goto err_put; } attr.cq = cq; attr.access_flags = cmd.access_flags; attr.min_rnr_timer = cmd.min_rnr_timer; attr.srq = srq; attr.tclass = cmd.tclass; attr.flow_label = cmd.flow_label; attr.dc_key = cmd.dc_key; attr.mtu = cmd.mtu; attr.port = cmd.port; attr.pkey_index = cmd.pkey_index; attr.gid_index = cmd.gid_index; attr.hop_limit = cmd.hop_limit; attr.create_flags = cmd.create_flags; dct = ib_create_dct(pd, &attr, &udata); if (IS_ERR(dct)) { ret = PTR_ERR(dct); goto err_put; } dct->device = file->device->ib_dev; dct->uobject = &obj->uobject; obj->uobject.object = dct; ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uobject); if (ret) goto err_dct; memset(&resp, 0, sizeof(resp)); resp.dct_handle = obj->uobject.id; resp.dctn = dct->dct_num; ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); if (ret) goto err_copy; mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->dct_list); mutex_unlock(&file->mutex); obj->uobject.live = 1; put_srq_read(srq); put_cq_read(cq); put_pd_read(pd); up_write(&obj->uobject.mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uobject); err_dct: ib_destroy_dct(dct); err_put: if (srq) put_srq_read(srq); if (cq) put_cq_read(cq); put_pd_read(pd); err_pd: put_uobj_write(&obj->uobject); return ret; } ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { int in_len = ucore->inlen + uhw->inlen; int out_len = ucore->outlen + uhw->outlen; struct ib_uverbs_destroy_dct cmd; struct ib_uverbs_destroy_dct_resp resp; struct ib_uobject *uobj; struct ib_dct *dct; struct ib_udct_object *obj; int ret; if (out_len < sizeof(resp)) return -ENOSPC; ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); if (ret) return ret; uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.user_handle, file->ucontext); if (!uobj) return -EINVAL; dct = uobj->object; obj = container_of(dct->uobject, struct ib_udct_object, uobject); ret = ib_destroy_dct(dct); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_dct_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); memset(&resp, 0, sizeof(resp)); put_uobj(uobj); ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); if (ret) return ret; return in_len; } ssize_t ib_uverbs_exp_query_dct(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { int in_len = ucore->inlen + uhw->inlen; int out_len = ucore->outlen + uhw->outlen; struct ib_uverbs_query_dct cmd; 
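/*
 * A note on the calling convention used by the exp_* handlers in this
 * block: unlike the classic handlers above, which take a raw user buffer
 * and call copy_from_user()/copy_to_user() directly, these receive two
 * struct ib_udata cursors (ucore/uhw) and go through
 * ucore->ops->copy_from()/copy_to(). The success return is still the
 * consumed input length. A minimal sketch of the classic form of that
 * contract, with a hypothetical handler and command/response structs
 * that are not part of this change:
 *
 *	struct ex_cmd  { __u64 response; __u32 handle; __u32 reserved; };
 *	struct ex_resp { __u32 value; __u32 reserved; };
 *
 *	static ssize_t ex_handler(struct ib_uverbs_file *file,
 *				  const char __user *buf,
 *				  int in_len, int out_len)
 *	{
 *		struct ex_cmd cmd;
 *		struct ex_resp resp = { 0 };
 *
 *		if (out_len < sizeof(resp))
 *			return -ENOSPC;
 *		if (copy_from_user(&cmd, buf, sizeof(cmd)))
 *			return -EFAULT;
 *		if (copy_to_user((void __user *)(unsigned long)cmd.response,
 *		    &resp, sizeof(resp)))
 *			return -EFAULT;
 *		return in_len;
 *	}
 */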
struct ib_uverbs_query_dct_resp resp; struct ib_dct *dct; struct ib_dct_attr *attr; int err; if (out_len < sizeof(resp)) return -ENOSPC; err = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); if (err) return err; attr = kmalloc(sizeof(*attr), GFP_KERNEL); if (!attr) { err = -ENOMEM; goto out; } dct = idr_read_dct(cmd.dct_handle, file->ucontext); if (!dct) { err = -EINVAL; goto out; } err = ib_query_dct(dct, attr); put_dct_read(dct); if (err) goto out; memset(&resp, 0, sizeof(resp)); resp.dc_key = attr->dc_key; resp.access_flags = attr->access_flags; resp.flow_label = attr->flow_label; resp.key_violations = attr->key_violations; resp.port = attr->port; resp.min_rnr_timer = attr->min_rnr_timer; resp.tclass = attr->tclass; resp.mtu = attr->mtu; resp.pkey_index = attr->pkey_index; resp.gid_index = attr->gid_index; resp.hop_limit = attr->hop_limit; resp.state = attr->state; err = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); out: kfree(attr); return err ? err : in_len; } /* * Experimental functions */ static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { ib_spec->type = kern_spec->type; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); memcpy(&ib_spec->eth.val, &kern_spec->eth.val, sizeof(struct ib_flow_eth_filter)); memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, sizeof(struct ib_flow_eth_filter)); break; case IB_FLOW_SPEC_IB: ib_spec->ib.size = sizeof(struct ib_flow_spec_ib); memcpy(&ib_spec->ib.val, &kern_spec->ib.val, sizeof(struct ib_flow_ib_filter)); memcpy(&ib_spec->ib.mask, &kern_spec->ib.mask, sizeof(struct ib_flow_ib_filter)); break; case IB_FLOW_SPEC_IPV4: ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, sizeof(struct ib_flow_ipv4_filter)); memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, sizeof(struct ib_flow_ipv4_filter)); break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, sizeof(struct ib_flow_tcp_udp_filter)); memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, sizeof(struct ib_flow_tcp_udp_filter)); break; default: return -EINVAL; } return 0; } int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; int err = 0; void *kern_spec; void *ib_spec; int i; if (ucore->outlen < sizeof(resp)) return -ENOSPC; err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (err) return err; ucore->inbuf += sizeof(cmd); ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; if (priv_check(curthread, PRIV_NET_RAW) && !disable_raw_qp_enforcement) return -EPERM; if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; if (cmd.flow_attr.size > ucore->inlen || cmd.flow_attr.size > (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; if (cmd.flow_attr.num_of_specs) { kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, GFP_KERNEL); if (!kern_flow_attr) return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); err = ib_copy_from_udata(kern_flow_attr + 1, ucore, cmd.flow_attr.size); if (err) 
		goto err_free_attr;
	} else {
		kern_flow_attr = &cmd.flow_attr;
	}

	uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
	if (!uobj) {
		err = -ENOMEM;
		goto err_free_attr;
	}
	init_uobj(uobj, 0, file->ucontext, &rule_lock_class);
	down_write(&uobj->mutex);

	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
	if (!qp) {
		err = -EINVAL;
		goto err_uobj;
	}

	flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL);
	if (!flow_attr) {
		err = -ENOMEM;
		goto err_put;
	}

	flow_attr->type = kern_flow_attr->type;
	flow_attr->priority = kern_flow_attr->priority;
	flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
	flow_attr->port = kern_flow_attr->port;
	flow_attr->flags = kern_flow_attr->flags;
	flow_attr->size = sizeof(*flow_attr);

	kern_spec = kern_flow_attr + 1;
	ib_spec = flow_attr + 1;
	for (i = 0; i < flow_attr->num_of_specs &&
	     cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
	     cmd.flow_attr.size >=
	     ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
		err = kern_spec_to_ib_spec(kern_spec, ib_spec);
		if (err)
			goto err_free;
		flow_attr->size += ((union ib_flow_spec *)ib_spec)->size;
		cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
		kern_spec += ((struct ib_uverbs_flow_spec *)kern_spec)->size;
		ib_spec += ((union ib_flow_spec *)ib_spec)->size;
	}
	if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
		pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
			i, cmd.flow_attr.size);
		err = -EINVAL;
		goto err_free;
	}
	flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
	if (IS_ERR(flow_id)) {
		err = PTR_ERR(flow_id);
		goto err_free;
	}
	flow_id->qp = qp;
	flow_id->uobject = uobj;
	uobj->object = flow_id;

	err = idr_add_uobj(&ib_uverbs_rule_idr, uobj);
	if (err)
		goto destroy_flow;

	memset(&resp, 0, sizeof(resp));
	resp.flow_handle = uobj->id;

	err = ib_copy_to_udata(ucore, &resp, sizeof(resp));
	if (err)
		goto err_copy;

	put_qp_read(qp);
	mutex_lock(&file->mutex);
	list_add_tail(&uobj->list, &file->ucontext->rule_list);
	mutex_unlock(&file->mutex);

	uobj->live = 1;
	up_write(&uobj->mutex);
	kfree(flow_attr);
	if (cmd.flow_attr.num_of_specs)
		kfree(kern_flow_attr);
	return 0;
err_copy:
	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
destroy_flow:
	ib_destroy_flow(flow_id);
err_free:
	kfree(flow_attr);
err_put:
	put_qp_read(qp);
err_uobj:
	put_uobj_write(uobj);
err_free_attr:
	if (cmd.flow_attr.num_of_specs)
		kfree(kern_flow_attr);
	return err;
}

int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
			      struct ib_udata *ucore, struct ib_udata *uhw)
{
	struct ib_uverbs_destroy_flow cmd;
	struct ib_flow *flow_id;
	struct ib_uobject *uobj;
	int ret;

	ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
	if (ret)
		return ret;

	uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
			      file->ucontext);
	if (!uobj)
		return -EINVAL;
	flow_id = uobj->object;

	ret = ib_destroy_flow(flow_id);
	if (!ret)
		uobj->live = 0;

	put_uobj_write(uobj);

	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);

	mutex_lock(&file->mutex);
	list_del(&uobj->list);
	mutex_unlock(&file->mutex);

	put_uobj(uobj);

	return ret;
}

ssize_t ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file,
				struct ib_udata *ucore, struct ib_udata *uhw)
{
	const char __user *buf = ucore->inbuf;
	int in_len = ucore->inlen + uhw->inlen;
	int out_len = ucore->outlen + uhw->outlen;

	return __uverbs_modify_qp(file, buf, in_len, out_len,
				  IB_USER_VERBS_CMD_EXTENDED);
}

ssize_t ib_uverbs_exp_create_cq(struct ib_uverbs_file *file,
				struct ib_udata *ucore, struct ib_udata *uhw)
{
	const char __user *buf = ucore->inbuf;
	int in_len = ucore->inlen + uhw->inlen;
	int out_len = ucore->outlen + uhw->outlen;
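/*
 * The spec-parsing loop in ib_uverbs_ex_create_flow() above is a
 * length-prefixed record walk: each ib_uverbs_flow_spec carries its own
 * size, which is validated against the bytes remaining before it is
 * trusted, and the loop must consume the buffer exactly. A
 * self-contained userland sketch of the same pattern (hypothetical
 * record layout, not part of this change; needs <stdint.h>, <stddef.h>
 * and EINVAL from <errno.h>):
 *
 *	struct rec_hdr {
 *		uint16_t type;
 *		uint16_t size;	(total record size, header included)
 *	};
 *
 *	static int walk_records(const unsigned char *buf, size_t len)
 *	{
 *		const struct rec_hdr *h;
 *
 *		while (len >= sizeof(*h)) {
 *			h = (const struct rec_hdr *)buf;
 *			if (h->size < sizeof(*h) || h->size > len)
 *				return -EINVAL;
 *			buf += h->size;
 *			len -= h->size;
 *		}
 *		return len == 0 ? 0 : -EINVAL;
 *	}
 */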
struct ib_uverbs_create_cq_ex cmd; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; return create_cq(file, buf, in_len, out_len, &cmd, IB_USER_VERBS_CMD_EXTENDED, ucore->outbuf); } ssize_t ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { const char __user *buf = ucore->inbuf; int in_len = ucore->inlen + uhw->inlen; struct ib_uverbs_modify_cq_ex cmd; struct ib_cq *cq; struct ib_cq_attr attr; int ret; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) return -EINVAL; attr.moderation.cq_count = cmd.cq_count; attr.moderation.cq_period = cmd.cq_period; attr.cq_cap_flags = cmd.cq_cap_flags; ret = ib_modify_cq(cq, &attr, cmd.attr_mask); put_cq_read(cq); return ret ? ret : in_len; } ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_exp_query_device_resp resp; struct ib_exp_device_attr exp_attr; int ret; if (ucore->outlen + uhw->outlen < sizeof(resp)) return -ENOSPC; memset(&resp, 0, sizeof(resp)); memset(&exp_attr, 0, sizeof(exp_attr)); ret = ib_exp_query_device(file->device->ib_dev, &exp_attr); if (ret) return ret; ib_uverbs_query_device_assign(&resp.base, &exp_attr.base, file); resp.comp_mask = 0; resp.device_cap_flags2 = 0; /* * Handle regular attr fields */ if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) { resp.timestamp_mask = exp_attr.base.timestamp_mask; resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { resp.hca_core_clock = exp_attr.base.hca_core_clock; resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; } /* * Handle experimental attr fields */ if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) { resp.device_cap_flags2 = exp_attr.device_cap_flags2; resp.comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2; } if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) { resp.dc_rd_req = exp_attr.dc_rd_req; resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD; } if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_RES_RD) { resp.dc_rd_res = exp_attr.dc_rd_res; resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD; } if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) { resp.inline_recv_sz = exp_attr.inline_recv_sz; resp.comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; } if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) { resp.max_rss_tbl_sz = exp_attr.max_rss_tbl_sz; resp.comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; } if (copy_to_user(ucore->outbuf, &resp, sizeof(resp))) return -EFAULT; return ucore->inlen + uhw->inlen; } ssize_t ib_uverbs_exp_create_qp(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uqp_object *obj; struct ib_device *device; struct ib_pd *pd = NULL; struct ib_xrcd *xrcd = NULL; struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_cq *scq = NULL, *rcq = NULL; struct ib_srq *srq = NULL; struct ib_qp *qp; struct ib_exp_qp_init_attr attr; int ret; struct ib_uverbs_exp_create_qp cmd_exp; struct ib_uverbs_exp_create_qp_resp resp_exp; struct ib_qp *parentqp = NULL; memset(&cmd_exp, 0, sizeof(cmd_exp)); ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp)); if (ret) return ret; if (!disable_raw_qp_enforcement && cmd_exp.qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW)) return -EPERM; obj = kzalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, 
cmd_exp.user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); if (cmd_exp.qp_type == IB_QPT_XRC_TGT) { xrcd = idr_read_xrcd(cmd_exp.pd_handle, file->ucontext, &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } device = xrcd->device; } else { if (cmd_exp.qp_type == IB_QPT_XRC_INI) { cmd_exp.max_recv_wr = 0; cmd_exp.max_recv_sge = 0; } else { if (cmd_exp.is_srq) { srq = idr_read_srq(cmd_exp.srq_handle, file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { ret = -EINVAL; goto err_put; } } if (cmd_exp.recv_cq_handle != cmd_exp.send_cq_handle) { rcq = idr_read_cq(cmd_exp.recv_cq_handle, file->ucontext, 0); if (!rcq) { ret = -EINVAL; goto err_put; } } } scq = idr_read_cq(cmd_exp.send_cq_handle, file->ucontext, !!rcq); rcq = rcq ?: scq; pd = idr_read_pd(cmd_exp.pd_handle, file->ucontext); if (!pd || !scq) { ret = -EINVAL; goto err_put; } device = pd->device; } memset(&attr, 0, sizeof(attr)); attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; attr.xrcd = xrcd; attr.sq_sig_type = cmd_exp.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd_exp.qp_type; attr.create_flags = 0; attr.cap.max_send_wr = cmd_exp.max_send_wr; attr.cap.max_recv_wr = cmd_exp.max_recv_wr; attr.cap.max_send_sge = cmd_exp.max_send_sge; attr.cap.max_recv_sge = cmd_exp.max_recv_sge; attr.cap.max_inline_data = cmd_exp.max_inline_data; if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS) attr.create_flags |= cmd_exp.qp_cap_flags & (IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV); if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) { struct ib_uverbs_qpg *qpg; if (cmd_exp.qp_type != IB_QPT_RAW_PACKET && cmd_exp.qp_type != IB_QPT_UD) { ret = -EINVAL; goto err_put; } qpg = &cmd_exp.qpg; switch (qpg->qpg_type) { case IB_QPG_PARENT: attr.parent_attrib.rss_child_count = qpg->parent_attrib.rss_child_count; attr.parent_attrib.tss_child_count = qpg->parent_attrib.tss_child_count; break; case IB_QPG_CHILD_RX: case IB_QPG_CHILD_TX: parentqp = idr_read_qp(qpg->parent_handle, file->ucontext); if (!parentqp) { ret = -EINVAL; goto err_put; } attr.qpg_parent = parentqp; break; default: ret = -EINVAL; goto err_put; } attr.qpg_type = qpg->qpg_type; } if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) attr.max_inl_recv = cmd_exp.max_inl_recv; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); if (cmd_exp.qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, (struct ib_qp_init_attr *)&attr); else qp = device->exp_create_qp(pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } if (cmd_exp.qp_type != IB_QPT_XRC_TGT) { qp->real_qp = qp; qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); } qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp_exp, 0, sizeof(resp_exp)); resp_exp.qpn = qp->qp_num; resp_exp.qp_handle = obj->uevent.uobject.id; resp_exp.max_recv_sge = attr.cap.max_recv_sge; resp_exp.max_send_sge = attr.cap.max_send_sge; 
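/*
 * As in ib_uverbs_create_qp() earlier, failure here unwinds through a
 * goto ladder (err_copy/err_destroy/err_put below) whose labels run in
 * the reverse order of acquisition, so a failure at step N releases
 * exactly the resources taken in steps 1..N-1. A minimal sketch of the
 * idiom; example_create() and register_payload() are hypothetical and
 * not part of this change:
 *
 *	static int example_create(void)
 *	{
 *		char *meta, *payload;
 *		int ret;
 *
 *		meta = kmalloc(64, GFP_KERNEL);
 *		if (!meta)
 *			return -ENOMEM;
 *		payload = kmalloc(256, GFP_KERNEL);
 *		if (!payload) {
 *			ret = -ENOMEM;
 *			goto err_meta;
 *		}
 *		ret = register_payload(payload);
 *		if (ret)
 *			goto err_payload;
 *		return 0;
 *
 *	err_payload:
 *		kfree(payload);
 *	err_meta:
 *		kfree(meta);
 *		return ret;
 *	}
 */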
resp_exp.max_recv_wr = attr.cap.max_recv_wr; resp_exp.max_send_wr = attr.cap.max_send_wr; resp_exp.max_inline_data = attr.cap.max_inline_data; if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) { resp_exp.comp_mask |= IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV; resp_exp.max_inl_recv = attr.max_inl_recv; } ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp)); if (ret) goto err_copy; if (xrcd) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); } if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); if (parentqp) put_qp_read(parentqp); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return ucore->inlen + uhw->inlen; err_copy: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); err_put: if (xrcd) put_xrcd_read(xrcd_uobj); if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); if (parentqp) put_qp_read(parentqp); put_uobj_write(&obj->uevent.uobject); return ret; } int ib_exp_query_device(struct ib_device *device, struct ib_exp_device_attr *device_attr) { return device->exp_query_device(device, device_attr); } EXPORT_SYMBOL(ib_exp_query_device); Index: head/sys/ofed/drivers/infiniband/debug/memtrack.c =================================================================== --- head/sys/ofed/drivers/infiniband/debug/memtrack.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/debug/memtrack.c (revision 300676) @@ -1,958 +1,960 @@ /* This software is available to you under a choice of one of two licenses. You may choose to be licensed under the terms of the GNU General Public License (GPL) Version 2, available at , or the OpenIB.org BSD license, available in the LICENSE.TXT file accompanying this software. These details are also available at . THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. 
*/ +#define LINUXKPI_PARAM_PREFIX memtrack_ + #define C_MEMTRACK_C #ifdef kmalloc #undef kmalloc #endif #ifdef kmemdup #undef kmemdup #endif #ifdef kfree #undef kfree #endif #ifdef vmalloc #undef vmalloc #endif #ifdef vzalloc #undef vzalloc #endif #ifdef vzalloc_node #undef vzalloc_node #endif #ifdef vfree #undef vfree #endif #ifdef kmem_cache_alloc #undef kmem_cache_alloc #endif #ifdef kmem_cache_free #undef kmem_cache_free #endif #ifdef ioremap #undef ioremap #endif #ifdef io_mapping_create_wc #undef io_mapping_create_wc #endif #ifdef io_mapping_free #undef io_mapping_free #endif #ifdef ioremap_nocache #undef ioremap_nocache #endif #ifdef iounmap #undef iounmap #endif #ifdef alloc_pages #undef alloc_pages #endif #ifdef free_pages #undef free_pages #endif #ifdef get_page #undef get_page #endif #ifdef put_page #undef put_page #endif #ifdef create_workqueue #undef create_workqueue #endif #ifdef create_rt_workqueue #undef create_rt_workqueue #endif #ifdef create_freezeable_workqueue #undef create_freezeable_workqueue #endif #ifdef create_singlethread_workqueue #undef create_singlethread_workqueue #endif #ifdef destroy_workqueue #undef destroy_workqueue #endif #include #include #include #include #include #include #include #include #include #include "memtrack.h" #include MODULE_AUTHOR("Mellanox Technologies LTD."); MODULE_DESCRIPTION("Memory allocations tracking"); MODULE_LICENSE("GPL"); #define MEMTRACK_HASH_SZ ((1<<15)-19) /* prime: http://www.utm.edu/research/primes/lists/2small/0bit.html */ #define MAX_FILENAME_LEN 31 #define memtrack_spin_lock(spl, flags) spin_lock_irqsave(spl, flags) #define memtrack_spin_unlock(spl, flags) spin_unlock_irqrestore(spl, flags) /* if a bit is set then the corresponding allocation is tracked. bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */ static unsigned long track_mask = -1; /* effectively everything */ module_param(track_mask, ulong, 0444); MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked"); /* if a bit is set then the corresponding allocation is strictly tracked. That is, before inserting the whole range is checked to not overlap any of the allocations already in the database */ static unsigned long strict_track_mask = 0; /* no strict tracking */ module_param(strict_track_mask, ulong, 0444); MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking"); /* Sets the frequency of allocations failures injections if set to 0 all allocation should succeed */ static unsigned int inject_freq = 0; module_param(inject_freq, uint, 0644); MODULE_PARM_DESC(inject_freq, "Error injection frequency, default is 0 (disabled)"); static int random_mem = 1; module_param(random_mem, uint, 0644); MODULE_PARM_DESC(random_mem, "When set, randomize allocated memory, default is 1 (enabled)"); struct memtrack_meminfo_t { unsigned long addr; unsigned long size; unsigned long line_num; unsigned long dev; unsigned long addr2; int direction; struct memtrack_meminfo_t *next; struct list_head list; /* used to link all items from a certain type together */ char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. 
packing */ char ext_info[32]; }; static struct kmem_cache *meminfo_cache; struct tracked_obj_desc_t { struct memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ]; spinlock_t hash_lock; unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */ struct list_head tracked_objs_head; /* head of list of all objects */ int strict_track; /* if 1, then for each object inserted, check whether it overlaps any of the objects already in the list */ }; static struct tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES]; static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = { "kmalloc", "vmalloc", "kmem_cache_alloc", "io_remap", "create_workqueue", "alloc_pages", "ib_dma_map_single", "ib_dma_map_page", "ib_dma_map_sg" }; static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = { "kfree", "vfree", "kmem_cache_free", "io_unmap", "destroy_workqueue", "free_pages", "ib_dma_unmap_single", "ib_dma_unmap_page", "ib_dma_unmap_sg" }; static inline const char *memtype_alloc_str(enum memtrack_memtype_t memtype) { switch (memtype) { case MEMTRACK_KMALLOC: case MEMTRACK_VMALLOC: case MEMTRACK_KMEM_OBJ: case MEMTRACK_IOREMAP: case MEMTRACK_WORK_QUEUE: case MEMTRACK_PAGE_ALLOC: case MEMTRACK_DMA_MAP_SINGLE: case MEMTRACK_DMA_MAP_PAGE: case MEMTRACK_DMA_MAP_SG: return rsc_names[memtype]; default: return "(Unknown allocation type)"; } } static inline const char *memtype_free_str(enum memtrack_memtype_t memtype) { switch (memtype) { case MEMTRACK_KMALLOC: case MEMTRACK_VMALLOC: case MEMTRACK_KMEM_OBJ: case MEMTRACK_IOREMAP: case MEMTRACK_WORK_QUEUE: case MEMTRACK_PAGE_ALLOC: case MEMTRACK_DMA_MAP_SINGLE: case MEMTRACK_DMA_MAP_PAGE: case MEMTRACK_DMA_MAP_SG: return rsc_free_names[memtype]; default: return "(Unknown allocation type)"; } } /* * overlap_a_b */ static inline int overlap_a_b(unsigned long a_start, unsigned long a_end, unsigned long b_start, unsigned long b_end) { if ((b_start > a_end) || (a_start > b_end)) return 0; return 1; } /* * check_overlap */ static void check_overlap(enum memtrack_memtype_t memtype, struct memtrack_meminfo_t *mem_info_p, struct tracked_obj_desc_t *obj_desc_p) { struct list_head *pos, *next; struct memtrack_meminfo_t *cur; unsigned long start_a, end_a, start_b, end_b; start_a = mem_info_p->addr; end_a = mem_info_p->addr + mem_info_p->size - 1; list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) { cur = list_entry(pos, struct memtrack_meminfo_t, list); start_b = cur->addr; end_b = cur->addr + cur->size - 1; if (overlap_a_b(start_a, end_a, start_b, end_b)) printk(KERN_ERR "%s overlaps!
new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n", memtype_alloc_str(memtype), mem_info_p->addr, mem_info_p->addr + mem_info_p->size - 1, cur->addr, cur->addr + cur->size - 1); } } /* Invoke on memory allocation */ void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, unsigned long addr, unsigned long size, unsigned long addr2, int direction, const char *filename, const unsigned long line_num, int alloc_flags) { unsigned long hash_val; struct memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p; struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); return; } if (!tracked_objs_arr[memtype]) { /* object is not tracked */ return; } obj_desc_p = tracked_objs_arr[memtype]; hash_val = addr % MEMTRACK_HASH_SZ; new_mem_info_p = (struct memtrack_meminfo_t *)kmem_cache_alloc(meminfo_cache, alloc_flags); if (new_mem_info_p == NULL) { printk(KERN_ERR "%s: Failed allocating kmem_cache item for new mem_info. " "Lost tracking on allocation at %s:%lu...\n", __func__, filename, line_num); return; } /* save allocation properties */ new_mem_info_p->addr = addr; new_mem_info_p->size = size; new_mem_info_p->dev = dev; new_mem_info_p->addr2 = addr2; new_mem_info_p->direction = direction; new_mem_info_p->line_num = line_num; *new_mem_info_p->ext_info = '\0'; /* Make sure that we will print out the path tail if the given filename is longer * than MAX_FILENAME_LEN. (otherwise, we will not see the name of the actual file * in the printout -- only the path head!) */ if (strlen(filename) > MAX_FILENAME_LEN) strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN); else strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN); new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */ memtrack_spin_lock(&obj_desc_p->hash_lock, flags); if ((memtype != MEMTRACK_DMA_MAP_SINGLE) && (memtype != MEMTRACK_DMA_MAP_PAGE) && (memtype != MEMTRACK_DMA_MAP_SG)) { /* make sure given memory location is not already allocated */ cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; while (cur_mem_info_p != NULL) { if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { /* Found given address in the database */ printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n", __func__, filename, line_num, memtype_alloc_str(memtype), addr, cur_mem_info_p->filename, cur_mem_info_p->line_num); memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); kmem_cache_free(meminfo_cache, new_mem_info_p); return; } cur_mem_info_p = cur_mem_info_p->next; } } /* not found - we can put in the hash bucket */ /* link as first */ new_mem_info_p->next = obj_desc_p->mem_hash[hash_val]; obj_desc_p->mem_hash[hash_val] = new_mem_info_p; if (obj_desc_p->strict_track) check_overlap(memtype, new_mem_info_p, obj_desc_p); obj_desc_p->count += size; list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head); memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return; } EXPORT_SYMBOL(memtrack_alloc); /* Invoke on memory free */ void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, unsigned long addr, unsigned long size, int direction, const char *filename, const unsigned long line_num) { unsigned long hash_val; struct memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p; struct tracked_obj_desc_t *obj_desc_p; unsigned
long flags; if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); return; } if (!tracked_objs_arr[memtype]) { /* object is not tracked */ return; } obj_desc_p = tracked_objs_arr[memtype]; hash_val = addr % MEMTRACK_HASH_SZ; memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* find mem_info of given memory location */ prev_mem_info_p = NULL; cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; while (cur_mem_info_p != NULL) { if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { /* Found given address in the database */ if ((memtype == MEMTRACK_DMA_MAP_SINGLE) || (memtype == MEMTRACK_DMA_MAP_PAGE) || (memtype == MEMTRACK_DMA_MAP_SG)) { if (direction != cur_mem_info_p->direction) printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad direction for addr 0x%lX: alloc:0x%x, free:0x%x (allocated in %s::%lu)\n", __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->direction, direction, cur_mem_info_p->filename, cur_mem_info_p->line_num); if (size != cur_mem_info_p->size) printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad size for addr 0x%lX: size:%lu, free:%lu (allocated in %s::%lu)\n", __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->size, size, cur_mem_info_p->filename, cur_mem_info_p->line_num); } /* Remove from the bucket/list */ if (prev_mem_info_p == NULL) obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */ else prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */ list_del(&cur_mem_info_p->list); obj_desc_p->count -= cur_mem_info_p->size; memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); kmem_cache_free(meminfo_cache, cur_mem_info_p); return; } prev_mem_info_p = cur_mem_info_p; cur_mem_info_p = cur_mem_info_p->next; } /* not found */ printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX, device=0x%lX\n", __func__, filename, line_num, memtype_free_str(memtype), addr, dev); memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return; } EXPORT_SYMBOL(memtrack_free); /* * This function recognizes allocations which * may be released by kernel (e.g. skb) and * therefore not trackable by memtrack. * The allocations are recognized by the name * of their calling function. 
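* For example (an illustrative note, not part of the original source): a caller
* named "build_skb" would contain the substring "skb" and so be treated as
* non-trackable, while "ipoib_cm_skb_too_long" appears in the exception
* table below and therefore remains tracked.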
*/ int is_non_trackable_alloc_func(const char *func_name) { static const char * const str_str_arr[] = { /* functions containing these strings are considered non-trackable */ "skb", }; static const char * const str_str_excep_arr[] = { /* functions which are exceptions to the str_str_arr table */ "ipoib_cm_skb_too_long" }; static const char * const str_cmp_arr[] = { /* functions that allocate SKBs */ "mlx4_en_alloc_frags", "mlx4_en_alloc_frag", "mlx4_en_init_allocator", "mlx4_en_free_frag", "mlx4_en_free_rx_desc", "mlx4_en_destroy_allocator", "mlx4_en_complete_rx_desc", /* vnic skb functions */ "free_single_frag", "vnic_alloc_rx_skb", "vnic_rx_skb", "vnic_alloc_frag", "vnic_empty_rx_entry", "vnic_init_allocator", "vnic_destroy_allocator", "sdp_post_recv", "sdp_rx_ring_purge", "sdp_post_srcavail", "sk_stream_alloc_page", "update_send_head", "sdp_bcopy_get", "sdp_destroy_resources", /* function that allocates memory for the RDMA device context */ "ib_alloc_device" }; size_t str_str_arr_size = sizeof(str_str_arr)/sizeof(char *); size_t str_str_excep_size = sizeof(str_str_excep_arr)/sizeof(char *); size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); int i, j; for (i = 0; i < str_str_arr_size; ++i) if (strstr(func_name, str_str_arr[i])) { for (j = 0; j < str_str_excep_size; ++j) if (!strcmp(func_name, str_str_excep_arr[j])) return 0; return 1; } for (i = 0; i < str_cmp_arr_size; ++i) if (!strcmp(func_name, str_cmp_arr[i])) return 1; return 0; } EXPORT_SYMBOL(is_non_trackable_alloc_func); /* * In some cases we need to free memory * that we defined as "non trackable" (see * is_non_trackable_alloc_func). * This function recognizes such releases * by the name of their calling function. */ int is_non_trackable_free_func(const char *func_name) { static const char * const str_cmp_arr[] = { /* function that deallocates memory for the RDMA device context */ "ib_dealloc_device" }; size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); int i; for (i = 0; i < str_cmp_arr_size; ++i) if (!strcmp(func_name, str_cmp_arr[i])) return 1; return 0; } EXPORT_SYMBOL(is_non_trackable_free_func); /* WA - This function checks whether the function name is '__ib_umem_release' or 'ib_umem_get'. In these cases we won't track the memory because the kernel was the one who allocated it. Return value: 1 - if the function name matches, else 0 */ int is_umem_put_page(const char *func_name) { const char func_str[18] = "__ib_umem_release"; /* In case of error flow put_page is called as part of ib_umem_get */ const char func_str1[12] = "ib_umem_get"; return ((strstr(func_name, func_str) != NULL) || (strstr(func_name, func_str1) != NULL)) ?
1 : 0; } EXPORT_SYMBOL(is_umem_put_page); /* Check page order/size: when freeing a page allocation, verify that we are trying to free the same size we asked to allocate */ int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, unsigned long size, const char *filename, const unsigned long line_num) { unsigned long hash_val; struct memtrack_meminfo_t *cur_mem_info_p; struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; int ret = 0; if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); return 1; } if (!tracked_objs_arr[memtype]) { /* object is not tracked */ return 1; } obj_desc_p = tracked_objs_arr[memtype]; hash_val = addr % MEMTRACK_HASH_SZ; memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* find mem_info of given memory location */ cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; while (cur_mem_info_p != NULL) { if (cur_mem_info_p->addr == addr) { /* Found given address in the database - check size */ if (cur_mem_info_p->size != size) { printk(KERN_ERR "mtl size inconsistency: %s: %s::%lu: try to %s at address=0x%lX with size %lu while it was created with size %lu\n", __func__, filename, line_num, memtype_free_str(memtype), addr, size, cur_mem_info_p->size); snprintf(cur_mem_info_p->ext_info, sizeof(cur_mem_info_p->ext_info), "invalid free size %lu\n", size); ret = 1; } memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return ret; } cur_mem_info_p = cur_mem_info_p->next; } /* not found - this function only checks size/order consistency and gives no other indication; unknown addresses are reported by the 'free' function */ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return 1; } EXPORT_SYMBOL(memtrack_check_size); /* Search the current database for a specific addr and check whether it exists.
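(An inference from the body below, not original wording: the expect_exist flag states whether the caller assumes the address is already tracked; only in that case is a missing address reported as an inconsistency.)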
It will print an error msg if we get an unexpected result. Return value: 0 - if addr exists, else 1 */ int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, const char *filename, const unsigned long line_num) { unsigned long hash_val; struct memtrack_meminfo_t *cur_mem_info_p; struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); return 1; } if (!tracked_objs_arr[memtype]) { /* object is not tracked */ return 0; } obj_desc_p = tracked_objs_arr[memtype]; hash_val = addr % MEMTRACK_HASH_SZ; memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* find mem_info of given memory location */ cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; while (cur_mem_info_p != NULL) { if (cur_mem_info_p->addr == addr) { /* Found given address in the database - exiting */ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return 0; } cur_mem_info_p = cur_mem_info_p->next; } /* not found */ if (expect_exist) printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n", __func__, filename, line_num, memtype_free_str(memtype), addr); memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return 1; } EXPORT_SYMBOL(memtrack_is_new_addr); /* Return current page reference counter */ int memtrack_get_page_ref_count(unsigned long addr) { unsigned long hash_val; struct memtrack_meminfo_t *cur_mem_info_p; struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; /* This function is called only for page allocation */ enum memtrack_memtype_t memtype = MEMTRACK_PAGE_ALLOC; int ref_count = 0; if (!tracked_objs_arr[memtype]) { /* object is not tracked */ return ref_count; } obj_desc_p = tracked_objs_arr[memtype]; hash_val = addr % MEMTRACK_HASH_SZ; memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* find mem_info of given memory location */ cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; while (cur_mem_info_p != NULL) { if (cur_mem_info_p->addr == addr) { /* Found given address in the database - check ref-count */ struct page *page = (struct page *)(cur_mem_info_p->addr); ref_count = atomic_read(&page->_count); memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return ref_count; } cur_mem_info_p = cur_mem_info_p->next; } /* not found */ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return ref_count; } EXPORT_SYMBOL(memtrack_get_page_ref_count); /* Report current allocations status (for all memory types) */ static void memtrack_report(void) { enum memtrack_memtype_t memtype; unsigned long cur_bucket; struct memtrack_meminfo_t *cur_mem_info_p; int serial = 1; struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; unsigned long detected_leaks = 0; printk(KERN_INFO "%s: Currently known allocations:\n", __func__); for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { if (tracked_objs_arr[memtype]) { printk(KERN_INFO "%d) %s:\n", serial, memtype_alloc_str(memtype)); obj_desc_p = tracked_objs_arr[memtype]; /* Scan all buckets to find existing allocations */ /* TBD: this may be optimized by holding a linked list of all hash items */ for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; while (cur_mem_info_p != NULL) { /* scan bucket */ printk(KERN_INFO "%s::%lu: %s(%lu)==%lX dev=%lX %s\n", cur_mem_info_p->filename, cur_mem_info_p->line_num, memtype_alloc_str(memtype),
cur_mem_info_p->size, cur_mem_info_p->addr, cur_mem_info_p->dev, cur_mem_info_p->ext_info); cur_mem_info_p = cur_mem_info_p->next; ++detected_leaks; } /* while cur_mem_info_p */ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); } /* for cur_bucket */ serial++; } } /* for memtype */ printk(KERN_INFO "%s: Summary: %lu leak(s) detected\n", __func__, detected_leaks); } static struct proc_dir_entry *memtrack_tree; static enum memtrack_memtype_t get_rsc_by_name(const char *name) { enum memtrack_memtype_t i; for (i = 0; i < MEMTRACK_NUM_OF_MEMTYPES; ++i) { if (strcmp(name, rsc_names[i]) == 0) return i; } return i; } static ssize_t memtrack_read(struct file *filp, char __user *buf, size_t size, loff_t *offset) { unsigned long cur, flags; loff_t pos = *offset; static char kbuf[20]; static int file_len; int _read, to_ret, left; const char *fname; enum memtrack_memtype_t memtype; if (pos < 0) return -EINVAL; fname = filp->f_dentry->d_name.name; memtype = get_rsc_by_name(fname); if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { printk(KERN_ERR "invalid file name\n"); return -EINVAL; } if (pos == 0) { memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags); cur = tracked_objs_arr[memtype]->count; memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags); _read = sprintf(kbuf, "%lu\n", cur); if (_read < 0) return _read; else file_len = _read; } left = file_len - pos; to_ret = (left < size) ? left : size; if (copy_to_user(buf, kbuf+pos, to_ret)) return -EFAULT; else { *offset = pos + to_ret; return to_ret; } } static const struct file_operations memtrack_proc_fops = { .read = memtrack_read, }; static const char *memtrack_proc_entry_name = "mt_memtrack"; static int create_procfs_tree(void) { struct proc_dir_entry *dir_ent; struct proc_dir_entry *proc_ent; int i, j; unsigned long bit_mask; dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL); if (!dir_ent) return -1; memtrack_tree = dir_ent; for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { if (bit_mask & track_mask) { proc_ent = create_proc_entry(rsc_names[i], S_IRUGO, memtrack_tree); if (!proc_ent) goto undo_create_root; proc_ent->proc_fops = &memtrack_proc_fops; } } goto exit_ok; undo_create_root: for (j = 0, bit_mask = 1; j < i; ++j, bit_mask <<= 1) { if (bit_mask & track_mask) remove_proc_entry(rsc_names[j], memtrack_tree); } remove_proc_entry(memtrack_proc_entry_name, NULL); return -1; exit_ok: return 0; } static void destroy_procfs_tree(void) { int i; unsigned long bit_mask; for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { if (bit_mask & track_mask) remove_proc_entry(rsc_names[i], memtrack_tree); } remove_proc_entry(memtrack_proc_entry_name, NULL); } int memtrack_inject_error(void) { int val = 0; if (inject_freq) { if (!(random32() % inject_freq)) val = 1; } return val; } EXPORT_SYMBOL(memtrack_inject_error); int memtrack_randomize_mem(void) { return random_mem; } EXPORT_SYMBOL(memtrack_randomize_mem); /* module entry points */ int init_module(void) { enum memtrack_memtype_t i; int j; unsigned long bit_mask; /* create a cache for the memtrack_meminfo_t structures */ meminfo_cache = kmem_cache_create("memtrack_meminfo_t", sizeof(struct memtrack_meminfo_t), 0, SLAB_HWCACHE_ALIGN, NULL); if (!meminfo_cache) { printk(KERN_ERR "memtrack::%s: failed to allocate meminfo cache\n", __func__); return -1; } /* initialize array of descriptors */ memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr)); /* create a tracking object descriptor for all required objects */ for (i = 0, bit_mask =
1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { if (bit_mask & track_mask) { tracked_objs_arr[i] = vmalloc(sizeof(struct tracked_obj_desc_t)); if (!tracked_objs_arr[i]) { printk(KERN_ERR "memtrack: failed to allocate tracking object\n"); goto undo_cache_create; } memset(tracked_objs_arr[i], 0, sizeof(struct tracked_obj_desc_t)); spin_lock_init(&tracked_objs_arr[i]->hash_lock); INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head); if (bit_mask & strict_track_mask) tracked_objs_arr[i]->strict_track = 1; else tracked_objs_arr[i]->strict_track = 0; } } if (create_procfs_tree()) { printk(KERN_ERR "%s: create_procfs_tree() failed\n", __FILE__); goto undo_cache_create; } printk(KERN_INFO "memtrack::%s done.\n", __func__); return 0; undo_cache_create: for (j = 0; j < i; ++j) { if (tracked_objs_arr[j]) vfree(tracked_objs_arr[j]); } #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) if (kmem_cache_destroy(meminfo_cache) != 0) printk(KERN_ERR "Failed on kmem_cache_destroy!\n"); #else kmem_cache_destroy(meminfo_cache); #endif return -1; } void cleanup_module(void) { enum memtrack_memtype_t memtype; unsigned long cur_bucket; struct memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p; struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; memtrack_report(); destroy_procfs_tree(); /* clean up any hash table left-overs */ for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { /* Scan all buckets to find existing allocations */ /* TBD: this may be optimized by holding a linked list of all hash items */ if (tracked_objs_arr[memtype]) { obj_desc_p = tracked_objs_arr[memtype]; for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; while (cur_mem_info_p != NULL) { /* scan bucket */ next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */ kmem_cache_free(meminfo_cache, cur_mem_info_p); cur_mem_info_p = next_mem_info_p; } /* while cur_mem_info_p */ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); } /* for cur_bucket */ vfree(obj_desc_p); } } /* for memtype */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) if (kmem_cache_destroy(meminfo_cache) != 0) printk(KERN_ERR "memtrack::cleanup_module: Failed on kmem_cache_destroy!\n"); #else kmem_cache_destroy(meminfo_cache); #endif printk(KERN_INFO "memtrack::cleanup_module done.\n"); } Index: head/sys/ofed/drivers/infiniband/hw/mlx4/main.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 300676) @@ -1,2894 +1,2896 @@ /* * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. 
* * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#define LINUXKPI_PARAM_PREFIX mlx4_ + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include "mlx4_exp.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" #define DRV_RELDATE __DATE__ #define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" #define MLX4_IB_MRS_PROC_DIR_NAME "mrs" #define MLX4_IB_FLOW_MAX_PRIO 0xFFF #define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); #ifdef __linux__ MODULE_VERSION(DRV_VERSION); #endif int mlx4_ib_sm_guid_assign = 1; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); enum { MAX_NUM_STR_BITMAP = 1 << 15, DEFAULT_TBL_VAL = -1 }; static struct mlx4_dbdf2val_lst dev_assign_str = { .name = "dev_assign_str param", .num_vals = 1, .def_val = {DEFAULT_TBL_VAL}, .range = {0, MAX_NUM_STR_BITMAP - 1} }; module_param_string(dev_assign_str, dev_assign_str.str, sizeof(dev_assign_str.str), 0444); MODULE_PARM_DESC(dev_assign_str, "Map device function numbers to IB device numbers (e.g. '0000:04:00.0-0,002b:1c:0b.a-1,...').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for IB device numbers (e.g. 
1).\n" "\t\tMax supported devices - 32"); static unsigned long *dev_num_str_bitmap; static spinlock_t dev_num_str_lock; static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; struct update_gid_work { struct work_struct work; union ib_gid gids[128]; struct mlx4_ib_dev *dev; int port; }; struct dev_rec { int bus; int dev; int func; int nr; }; static int dr_active; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*, unsigned long); static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev); static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static union ib_gid zgid; static int check_flow_steering_support(struct mlx4_dev *dev) { int eth_num_ports = 0; int ib_num_ports = 0; int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; if (dmfs) { int i; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) eth_num_ports++; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_num_ports++; dmfs &= (!ib_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && (!eth_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); if (ib_num_ports && mlx4_is_mfunc(dev)) { dmfs = 0; } } return dmfs; } int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memset(props, 0, sizeof *props); props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK | IB_DEVICE_SHARED_MR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (check_flow_steering_support(dev->dev)) props->device_cap_flags |= 
IB_DEVICE_MANAGED_FLOW_STEERING; props->device_cap_flags |= IB_DEVICE_QPG; if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { props->device_cap_flags |= IB_DEVICE_UD_RSS; props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; } if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; else props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock; if (dev->dev->caps.hca_core_clock > 0) props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; if (dev->dev->caps.cq_timestamp) { props->timestamp_mask = 0xFFFFFFFFFFFF; props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } out: kfree(in_mad); kfree(out_mad); return err; } static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? 
IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); if (netw_view) props->gid_tbl_len = out_mad->data[50]; else props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; /* Check if extended speeds (EDR/FDR/...) are supported */ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { ext_active_speed = out_mad->data[62] >> 4; switch (ext_active_speed) { case 1: props->active_speed = IB_SPEED_FDR; break; case 2: props->active_speed = IB_SPEED_EDR; break; } } /* If reported active speed is QDR, check if it is FDR-10 */ if (props->active_speed == IB_SPEED_QDR) { init_query_mad(in_mad); in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; /* Checking LinkSpeedActive for FDR-10 */ if (out_mad->data[15] & 0x1) props->active_speed = IB_SPEED_FDR10; } /* Avoid wrong speed value returned by FW if the IB link is down. */ if (props->state == IB_PORT_DOWN) props->active_speed = IB_SPEED_SDR; out: kfree(in_mad); kfree(out_mad); return err; } static u8 state_to_phys_state(enum ib_port_state state) { return state == IB_PORT_ACTIVE ? 5 : 3; } static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); struct mlx4_ib_iboe *iboe = &mdev->iboe; struct net_device *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; unsigned long flags; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ?
IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP; if (netw_view) props->gid_tbl_len = MLX4_ROCE_MAX_GIDS; else props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; props->max_mtu = IB_MTU_4096; props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock_irqsave(&iboe->lock, flags); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; tmp = iboe_get_mtu(ndev->if_mtu); props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock_irqrestore(&iboe->lock, flags); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? ib_link_query_port(ibdev, port, props, netw_view) : eth_link_query_port(ibdev, port, props, netw_view); return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { /* returns host view */ return __mlx4_ib_query_port(ibdev, port, props, 0); } int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mlx4_ib_dev *dev = to_mdev(ibdev); int clear = 0; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(dev->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); if (mlx4_is_mfunc(dev->dev) && !netw_view) { if (index) { /* For any index > 0, return the null guid */ err = 0; clear = 1; goto out; } } init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: if (clear) memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); *gid = dev->iboe.gid_table[port - 1][index]; return 0; } static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; 
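/* Illustrative note (not in the original source): P_Key table MADs carry 32 entries per block, so the attr_mod set just below selects the block that contains 'index', and the reply is then indexed with index % 32 (e.g. index 40 -> block 1, entry 8). */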
in_mad->attr_mod = cpu_to_be32(index / 32); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); } static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; if (mlx4_is_slave(to_mdev(ibdev)->dev)) return -EOPNOTSUPP; spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, 64); spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) return 0; memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, 256); if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); } else { ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct ib_port_attr attr; u32 cap_mask; int err; mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) goto out; cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx4_SET_PORT(to_mdev(ibdev), port, !!(mask & IB_PORT_RESET_QKEY_CNTR), cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp_v3.bf_reg_size = 0; resp_v3.bf_regs_per_page = 0; } } else { resp.dev_caps = dev->dev->caps.userspace_caps; resp.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = 
dev->dev->caps.bf_regs_per_page; } else { resp.bf_reg_size = 0; resp.bf_regs_per_page = 0; } resp.cqe_size = dev->dev->caps.cqe_size; } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); if (err) { kfree(context); return ERR_PTR(err); } INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); return ERR_PTR(-EFAULT); } return &context->ibucontext; } static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); kfree(context); return 0; } /* XXX FBSD has no support for get_unmapped_area function */ #if 0 static unsigned long mlx4_ib_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start_addr; unsigned long page_size_order; unsigned long command; mm = current->mm; if (addr) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); /* Last 8 bits hold the command others are data per that command */ command = pgoff & MLX4_IB_MMAP_CMD_MASK; if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS; /* code is based on the huge-pages get_unmapped_area code */ start_addr = mm->free_area_cache; if (len <= mm->cached_hole_size) start_addr = TASK_UNMAPPED_BASE; full_search: addr = ALIGN(start_addr, 1 << page_size_order); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (TASK_SIZE - len < addr) { /* * Start a new search - just in case we missed * some holes. 
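* (An inference from the code above, not original wording: the first pass
* began at mm->free_area_cache, so restarting from TASK_UNMAPPED_BASE
* re-scans the address range below that cached hint.)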
*/ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = TASK_UNMAPPED_BASE; goto full_search; } return -ENOMEM; } if (!vma || addr + len <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, 1 << page_size_order); } } #endif static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); /* Last 8 bits hold the command others are data per that command */ unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* compatibility handling for commands 0 & 1*/ if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; } if (command == MLX4_IB_MMAP_UAR_PAGE) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE && dev->dev->caps.bf_reg_size != 0) { vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_GET_HW_CLOCK) { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (ret) return ret; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, (pci_resource_start(dev->dev->pdev, params.bar) + params.offset) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else return -EINVAL; return 0; } static int mlx4_ib_ioctl(struct ib_ucontext *context, unsigned int cmd, unsigned long arg) { struct mlx4_ib_dev *dev = to_mdev(context->device); int ret; int offset; switch (cmd) { case MLX4_IOCHWCLOCKOFFSET: { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (!ret) { offset = params.offset % PAGE_SIZE; ret = put_user(offset, (int *)arg); return sizeof(int); } else { return ret; } } default: { pr_err("mlx4_ib: invalid ioctl %u command with arg %lX\n", cmd, arg); return -ENOTTY; } } return ret; } static int mlx4_ib_query_values(struct ib_device *device, int q_values, struct ib_device_values *values) { struct mlx4_ib_dev *dev = to_mdev(device); cycle_t cycles; values->values_mask = 0; if (q_values & IBV_VALUES_HW_CLOCK) { cycles = mlx4_read_clock(dev->dev); if (cycles < 0) { values->hwclock = cycles & CORE_CLOCK_MASK; values->values_mask |= IBV_VALUES_HW_CLOCK; } q_values &= ~IBV_VALUES_HW_CLOCK; } if (q_values) return -ENOTTY; return 0; } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } return &pd->ibpd; } static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); kfree(pd); return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_xrcd *xrcd; int err; if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = 
mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); if (err) goto err1; xrcd->pd = ib_alloc_pd(ibdev); if (IS_ERR(xrcd->pd)) { err = PTR_ERR(xrcd->pd); goto err2; } xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); if (IS_ERR(xrcd->cq)) { err = PTR_ERR(xrcd->cq); goto err3; } return &xrcd->ibxrcd; err3: ib_dealloc_pd(xrcd->pd); err2: mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); err1: kfree(xrcd); return ERR_PTR(err); } static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { ib_destroy_cq(to_mxrcd(xrcd)->cq); ib_dealloc_pd(to_mxrcd(xrcd)->pd); mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); kfree(xrcd); return 0; } static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) return -ENOMEM; ge->gid = *gid; if (mlx4_ib_add_mc(mdev, mqp, gid)) { ge->port = mqp->port; ge->added = 1; } mutex_lock(&mqp->mutex); list_add_tail(&ge->list, &mqp->gid_list); mutex_unlock(&mqp->mutex); return 0; } int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { u8 mac[6]; struct net_device *ndev; int ret = 0; if (!mqp->port) return 0; spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0); ret = 1; rtnl_unlock(); dev_put(ndev); } return ret; } struct mlx4_ib_steering { struct list_head list; u64 reg_id; union ib_gid gid; }; static int parse_flow_attr(struct mlx4_dev *dev, union ib_flow_spec *ib_spec, struct _rule_hw *mlx4_spec) { enum mlx4_net_trans_rule_id type; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: type = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, ETH_ALEN); memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, ETH_ALEN); mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; break; case IB_FLOW_SPEC_IB: type = MLX4_NET_TRANS_RULE_ID_IB; mlx4_spec->ib.l3_qpn = ib_spec->ib.val.l3_type_qpn; mlx4_spec->ib.qpn_mask = ib_spec->ib.mask.l3_type_qpn; memcpy(&mlx4_spec->ib.dst_gid, ib_spec->ib.val.dst_gid, 16); memcpy(&mlx4_spec->ib.dst_gid_msk, ib_spec->ib.mask.dst_gid, 16); break; case IB_FLOW_SPEC_IPV4: type = MLX4_NET_TRANS_RULE_ID_IPV4; mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: type = ib_spec->type == IB_FLOW_SPEC_TCP ? 
MLX4_NET_TRANS_RULE_ID_TCP : MLX4_NET_TRANS_RULE_ID_UDP; mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; break; default: return -EINVAL; } if (map_sw_to_hw_steering_id(dev, type) < 0 || hw_rule_sz(dev, type) < 0) return -EINVAL; mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type)); mlx4_spec->size = hw_rule_sz(dev, type) >> 2; return hw_rule_sz(dev, type); } static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain, enum mlx4_net_trans_promisc_mode flow_type, u64 *reg_id) { int ret, i; int size = 0; void *ib_flow; struct mlx4_ib_dev *mdev = to_mdev(qp->device); struct mlx4_cmd_mailbox *mailbox; struct mlx4_net_trans_rule_hw_ctrl *ctrl; size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + (sizeof(struct _rule_hw) * flow_attr->num_of_specs); static const u16 __mlx4_domain[] = { [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, }; if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { pr_err("Invalid priority value.\n"); return -EINVAL; } if (domain >= IB_FLOW_DOMAIN_NUM) { pr_err("Invalid domain value.\n"); return -EINVAL; } if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) return -EINVAL; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, rule_size); ctrl = mailbox->buf; ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | flow_attr->priority); ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type); ctrl->port = flow_attr->port; ctrl->qpn = cpu_to_be32(qp->qp_num); if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK) ctrl->flags = (1 << 3); ib_flow = flow_attr + 1; size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); for (i = 0; i < flow_attr->num_of_specs; i++) { ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(mdev->dev, mailbox); return -EINVAL; } ib_flow += ((union ib_flow_spec *)ib_flow)->size; size += ret; } ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret == -ENOMEM) pr_err("mcg table is full. Failed to register network rule.\n"); else if (ret == -ENXIO) pr_err("Device managed flow steering is disabled. Failed to register network rule.\n"); else if (ret) pr_err("Invalid argument. Failed to register network rule.\n"); mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; } static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) { int err; err = mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) pr_err("Failed to detach network rule;
registration id = 0x%llx\n", (unsigned long long)reg_id); return err; } static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { int err = 0, i = 0; struct mlx4_ib_flow *mflow; enum mlx4_net_trans_promisc_mode type[2]; memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(struct mlx4_ib_flow), GFP_KERNEL); if (!mflow) { err = -ENOMEM; goto err_free; } switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: type[0] = MLX4_FS_REGULAR; break; case IB_FLOW_ATTR_ALL_DEFAULT: type[0] = MLX4_FS_ALL_DEFAULT; break; case IB_FLOW_ATTR_MC_DEFAULT: type[0] = MLX4_FS_MC_DEFAULT; break; case IB_FLOW_ATTR_SNIFFER: type[0] = MLX4_FS_UC_SNIFFER; type[1] = MLX4_FS_MC_SNIFFER; break; default: err = -EINVAL; goto err_free; } while (i < ARRAY_SIZE(type) && type[i]) { err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], &mflow->reg_id[i]); if (err) goto err_free; i++; } return &mflow->ibflow; err_free: kfree(mflow); return ERR_PTR(err); } static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) { int err, ret = 0; int i = 0; struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); struct mlx4_ib_flow *mflow = to_mflow(flow_id); while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); if (err) ret = err; i++; } kfree(mflow); return ret; } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct mlx4_ib_gid_entry *ge; struct mlx4_ib_gid_entry *tmp; struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { ret = ge; break; } } return ret; } static int del_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_gid_entry *ge; struct net_device *ndev; u8 mac[6]; mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { spin_lock(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); rdma_get_mcast_mac((struct in6_addr *)gid, mac); if (ndev) { rtnl_lock(); dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0); rtnl_unlock(); dev_put(ndev); } list_del(&ge->list); kfree(ge); } else pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return ge != NULL ? 0 : -EINVAL; } static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid, int count) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id = 0; int record_err = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { struct mlx4_ib_steering *ib_steering; struct mlx4_ib_steering *tmp; LIST_HEAD(temp); mutex_lock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &mqp->steering_rules, list) { if (memcmp(ib_steering->gid.raw, gid->raw, 16)) continue; if (--count < 0) break; list_del(&ib_steering->list); list_add(&ib_steering->list, &temp); } mutex_unlock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &temp, list) { reg_id = ib_steering->reg_id; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? 
MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) { record_err = record_err ?: err; continue; } err = del_gid_entry(ibqp, gid); if (err) { record_err = record_err ?: err; continue; } list_del(&ib_steering->list); kfree(ib_steering); } mutex_lock(&mqp->mutex); list_for_each_entry(ib_steering, &temp, list) { list_add(&ib_steering->list, &mqp->steering_rules); } mutex_unlock(&mqp->mutex); if (count) { pr_warn("Couldn't release all reg_ids for mgid. Steering rule is left attached\n"); return -EINVAL; } } else { if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) return err; err = del_gid_entry(ibqp, gid); if (err) return err; } return record_err; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); int count = (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) ? mdev->dev->caps.num_ports : 1; return _mlx4_ib_mcg_detach(ibqp, gid, lid, count); } static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err = -ENODEV; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); DECLARE_BITMAP(ports, MLX4_MAX_PORTS); int i = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { bitmap_fill(ports, mdev->dev->caps.num_ports); } else { if (mqp->port <= mdev->dev->caps.num_ports) { bitmap_zero(ports, mdev->dev->caps.num_ports); set_bit(0, ports); } else { return -EINVAL; } } for (; i < mdev->dev->caps.num_ports; i++) { u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; if (!test_bit(i, ports)) continue; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); if (!ib_steering) goto err_add; } err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, i + 1, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), (ibqp->qp_type == IB_QPT_RAW_PACKET) ? 
MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, &reg_id); if (err) { kfree(ib_steering); goto err_add; } err = add_gid_entry(ibqp, gid); if (err) { mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); kfree(ib_steering); goto err_add; } if (ib_steering) { memcpy(ib_steering->gid.raw, gid->raw, 16); mutex_lock(&mqp->mutex); list_add(&ib_steering->list, &mqp->steering_rules); mutex_unlock(&mqp->mutex); ib_steering->reg_id = reg_id; } } return 0; err_add: if (i > 0) _mlx4_ib_mcg_detach(ibqp, gid, lid, i); return err; } static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; if (mlx4_is_master(dev->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(dev->ib_dev.node_desc, out_mad->data, 64); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } static ssize_t show_vsd(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ssize_t len = MLX4_VSD_LEN; if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX) len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd); else memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN); return len; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_vsd }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port) { memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; } else { eui[3] = 0xff; eui[4] = 0xfe; } eui[0] ^= 2; } static void 
update_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof gw->gids); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == IB_LINK_LAYER_ETHERNET) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); else mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num) { struct mlx4_ib_dev *ibdev = to_mdev(device); return mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num); } static void reset_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("reset gid table failed\n"); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof(gw->gids)); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 0) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 1, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 1 command failed\n"); } if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 2) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 1) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 2, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 2 command failed\n"); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static int update_gid_table(struct mlx4_ib_dev *dev, int port, union ib_gid *gid, int clear, int default_gid) { struct update_gid_work *work; int i; int need_update = 0; int free = -1; int found = -1; int max_gids; int start_index = !default_gid; max_gids = dev->dev->caps.gid_table_len[port]; for (i = start_index; i < max_gids; ++i) { if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, sizeof(*gid))) found = i; if (clear) { if (found >= 0) { need_update = 1; dev->iboe.gid_table[port - 1][found] = zgid; break; } } else { if (found >= 0) break; if (free < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof(*gid))) free = i; } } if (found == -1 && !clear && free < 0) { pr_err("GID table of port %d is full. 
Can't add "GID_PRINT_FMT"\n", port, GID_PRINT_ARGS(gid)); return -ENOMEM; } if (found == -1 && clear) { pr_err(GID_PRINT_FMT" is not in GID table of port %d\n", GID_PRINT_ARGS(gid), port); return -EINVAL; } if (found == -1 && !clear && free >= 0) { dev->iboe.gid_table[port - 1][free] = *gid; need_update = 1; } if (!need_update) return 0; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); INIT_WORK(&work->work, update_gids_task); work->port = port; work->dev = dev; queue_work(wq, &work->work); return 0; } static int reset_gid_table(struct mlx4_ib_dev *dev) { struct update_gid_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return -ENOMEM; memset(dev->iboe.gid_table, 0, sizeof(dev->iboe.gid_table)); memset(work->gids, 0, sizeof(work->gids)); INIT_WORK(&work->work, reset_gids_task); work->dev = dev; queue_work(wq, &work->work); return 0; } /* XXX BOND Related - stub (no support for these flags in FBSD)*/ static inline int netif_is_bond_master(struct net_device *dev) { #if 0 return (dev->flags & IFF_MASTER) && (dev->priv_flags & IFF_BONDING); #endif return 0; } static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid, u8 port) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev, port); } static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev) { u8 port = 0; struct mlx4_ib_iboe *iboe; struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? rdma_vlan_dev_real_dev(dev) : dev; iboe = &ibdev->iboe; for (port = 1; port <= MLX4_MAX_PORTS; ++port) if ((netif_is_bond_master(real_dev) && (real_dev == iboe->masters[port - 1])) || (!netif_is_bond_master(real_dev) && (real_dev == iboe->netdevs[port - 1]))) break; return port > MLX4_MAX_PORTS ? 
0 : port; } static void mlx4_ib_get_dev_addr(struct net_device *dev, struct mlx4_ib_dev *ibdev, u8 port) { struct ifaddr *ifa; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct inet6_dev *in6_dev; union ib_gid *pgid; struct inet6_ifaddr *ifp; #endif union ib_gid gid; if ((port == 0) || (port > MLX4_MAX_PORTS)) return; /* IPv4 gids */ TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) { ipv6_addr_set_v4mapped( ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr, (struct in6_addr *)&gid); update_gid_table(ibdev, port, &gid, 0, 0); } } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* IPv6 gids */ in6_dev = in6_dev_get(dev); if (in6_dev) { read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { pgid = (union ib_gid *)&ifp->addr; update_gid_table(ibdev, port, pgid, 0, 0); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); } #endif } static void mlx4_set_default_gid(struct mlx4_ib_dev *ibdev, struct net_device *dev, u8 port) { union ib_gid gid; mlx4_make_default_gid(dev, &gid, port); update_gid_table(ibdev, port, &gid, 0, 1); } static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) { struct net_device *dev; if (reset_gid_table(ibdev)) return -1; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(dev, &V_ifnet, if_link) { u8 port = mlx4_ib_get_dev_port(dev, ibdev); if (port) { if (!rdma_vlan_dev_real_dev(dev) && !netif_is_bond_master(dev)) mlx4_set_default_gid(ibdev, dev, port); mlx4_ib_get_dev_addr(dev, ibdev, port); } } IFNET_RUNLOCK_NOSLEEP(); return 0; } static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device *dev, unsigned long event) { struct mlx4_ib_iboe *iboe; int port; int init = 0; unsigned long flags; iboe = &ibdev->iboe; spin_lock_irqsave(&iboe->lock, flags); mlx4_foreach_ib_transport_port(port, ibdev->dev) { struct net_device *old_netdev = iboe->netdevs[port - 1]; /* XXX BOND related */ #if 0 struct net_device *old_master = iboe->masters[port - 1]; #endif iboe->masters[port - 1] = NULL; iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (old_netdev != iboe->netdevs[port - 1]) init = 1; if (dev == iboe->netdevs[port - 1] && event == NETDEV_CHANGEADDR) init = 1; /* XXX BOND related */ #if 0 if (iboe->netdevs[port - 1] && netif_is_bond_slave(iboe->netdevs[port - 1])) iboe->masters[port - 1] = iboe->netdevs[port - 1]->master; /* if bonding is used it is possible that we add it to masters only after IP address is assigned to the net bonding interface */ if (old_master != iboe->masters[port - 1]) init = 1; #endif } spin_unlock_irqrestore(&iboe->lock, flags); if (init) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Failed to reset gid table\n"); } static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); mlx4_ib_scan_netdevs(ibdev, dev, event); return NOTIFY_DONE; } /* This function initializes the gid table only if the event_netdev real device is an iboe * device; it is invoked by the inet/inet6 events. */ static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_netdev = ptr; struct mlx4_ib_dev *ibdev; struct mlx4_ib_iboe *ibdev_iboe; int port = 0; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? 
rdma_vlan_dev_real_dev(event_netdev) : event_netdev; ibdev_iboe = &ibdev->iboe; port = mlx4_ib_get_dev_port(real_dev, ibdev); /* Perform init_gid_table if the event real_dev is the net_device which represents this port, * otherwise this event is unrelated and is ignored. */ if (port && (real_dev == ibdev_iboe->netdevs[port - 1])) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Failed to reset gid table\n"); return NOTIFY_DONE; } static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; int slave; int i; if (mlx4_is_master(ibdev->dev)) { for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) { ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = /* master has the identity virt2phys pkey mapping */ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; mlx4_sync_pkey_table(ibdev->dev, slave, port, i, ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); } } } /* initialize pkey cache */ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; } } } static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { char name[32]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; int i, j, eq; /* Legacy mode or comp_pool is not large enough */ if (dev->caps.comp_pool == 0 || dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ dev->caps.num_ports); /* Init eq table */ added_eqs = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) added_eqs += eq_per_port; total_eqs = dev->caps.num_comp_vectors + added_eqs; ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); if (!ibdev->eq_table) return; ibdev->eq_added = added_eqs; eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, pci_get_domain(dev->pdev->dev.bsddev), pci_get_bus(dev->pdev->dev.bsddev), PCI_SLOT(dev->pdev->devfn), PCI_FUNC(dev->pdev->devfn)); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = (eq % dev->caps.num_comp_vectors); } eq++; } } /* Fill the rest of the vector with legacy EQs */ for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) ibdev->eq_table[eq++] = i; /* Advertise the new number of EQs to clients */ ibdev->ib_dev.num_comp_vectors = total_eqs; } static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; /* no additional eqs were added */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; /* Free only the added eqs */ for (i = 0; i < ibdev->eq_added; i++) { /* Don't free legacy eqs if used */ if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) continue; mlx4_release_eq(dev, ibdev->eq_table[i]); } kfree(ibdev->eq_table); } /* * create show function and a device_attribute struct pointing to * the function for _name */ #define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ static ssize_t show_rprt_##_name(struct device *dev, \ struct device_attribute *attr, \ char *buf) { \ return show_diag_rprt(dev, buf, _offset, _op_mod); \ } \ static DEVICE_ATTR(_name, 
S_IRUGO, show_rprt_##_name, NULL); #define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 static size_t show_diag_rprt(struct device *device, char *buf, u32 offset, u8 op_modifier) { size_t ret; u32 counter_offset = offset; u32 diag_counter = 0; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, &counter_offset, &diag_counter); if (ret) return ret; return sprintf(buf, "%d\n", diag_counter); } static ssize_t clear_diag_counters(struct device *device, struct device_attribute *attr, const char *buf, size_t length) { size_t ret; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, NULL, NULL); if (ret) return ret; return length; } DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); static struct attribute *diag_rprt_attrs[] = { &dev_attr_rq_num_lle.attr, &dev_attr_sq_num_lle.attr, &dev_attr_rq_num_lqpoe.attr, &dev_attr_sq_num_lqpoe.attr, &dev_attr_rq_num_lpe.attr, &dev_attr_sq_num_lpe.attr, &dev_attr_rq_num_wrfe.attr, &dev_attr_sq_num_wrfe.attr, &dev_attr_sq_num_mwbe.attr, &dev_attr_sq_num_bre.attr, &dev_attr_rq_num_lae.attr, &dev_attr_sq_num_rire.attr, &dev_attr_rq_num_rire.attr, &dev_attr_sq_num_rae.attr, &dev_attr_rq_num_rae.attr, &dev_attr_sq_num_roe.attr, &dev_attr_sq_num_tree.attr, &dev_attr_sq_num_rree.attr, &dev_attr_rq_num_rnr.attr, &dev_attr_sq_num_rnr.attr, &dev_attr_rq_num_oos.attr, &dev_attr_sq_num_oos.attr, &dev_attr_rq_num_mce.attr, &dev_attr_rq_num_udsdprd.attr, &dev_attr_rq_num_ucsdprd.attr, &dev_attr_num_cqovf.attr, &dev_attr_num_eqovf.attr, &dev_attr_num_baddb.attr, &dev_attr_clear_diag.attr, NULL }; static struct attribute_group diag_counters_group = { .name = "diag_counters", .attrs = diag_rprt_attrs }; static void init_dev_assign(void) { int i = 1; spin_lock_init(&dev_num_str_lock); if (mlx4_fill_dbdf2val_tbl(&dev_assign_str)) return; dev_num_str_bitmap = kmalloc(BITS_TO_LONGS(MAX_NUM_STR_BITMAP) * sizeof(long), GFP_KERNEL); if (!dev_num_str_bitmap) { pr_warn("bitmap alloc failed -- cannot apply dev_assign_str parameter\n"); return; } bitmap_zero(dev_num_str_bitmap, MAX_NUM_STR_BITMAP); while ((i < MLX4_DEVS_TBL_SIZE) && 
(dev_assign_str.tbl[i].dbdf != MLX4_ENDOF_TBL)) { if (bitmap_allocate_region(dev_num_str_bitmap, dev_assign_str.tbl[i].val[0], 0)) goto err; i++; } dr_active = 1; return; err: kfree(dev_num_str_bitmap); dev_num_str_bitmap = NULL; pr_warn("mlx4_ib: The value of 'dev_assign_str' parameter " "is incorrect. The parameter value is discarded!"); } static int mlx4_ib_dev_idx(struct mlx4_dev *dev) { int i, val; if (!dr_active) return -1; if (!dev) return -1; if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val)) return -1; if (val != DEFAULT_TBL_VAL) { dev->flags |= MLX4_FLAG_DEV_NUM_STR; return val; } spin_lock(&dev_num_str_lock); i = bitmap_find_free_region(dev_num_str_bitmap, MAX_NUM_STR_BITMAP, 0); spin_unlock(&dev_num_str_lock); if (i >= 0) return i; return -1; } static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports = 0; int i, j; int err; struct mlx4_ib_iboe *iboe; int dev_idx; pr_info_once("%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... */ if (num_ports == 0) return NULL; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); return NULL; } iboe = &ibdev->iboe; if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->priv_uar.map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; dev_idx = mlx4_ib_dev_idx(dev); if (dev_idx >= 0) sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx); else strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; else ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; ibdev->ib_dev.modify_port = mlx4_ib_modify_port; 
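/*
 * The uverbs_cmd_mask filled in above is a plain 64-bit bitmap: a verb is
 * reachable from userspace only when its bit is set.  A hypothetical
 * sanity check (not part of this driver) would reduce to a bit test:
 */
#if 0
	if (!(ibdev->ib_dev.uverbs_cmd_mask &
	    (1ull << IB_USER_VERBS_CMD_REG_MR)))
		pr_warn("mlx4_ib: REG_MR verb is not exported to userspace\n");
#endif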
ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; /* XXX FBSD has no support for get_unmapped_area function */ #if 0 ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; #endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; ibdev->ib_dev.query_ah = mlx4_ib_query_ah; ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; ibdev->ib_dev.create_srq = mlx4_ib_create_srq; ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; ibdev->ib_dev.query_srq = mlx4_ib_query_srq; ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; ibdev->ib_dev.create_qp = mlx4_ib_create_qp; ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; ibdev->ib_dev.get_netdev = mlx4_ib_get_netdev; ibdev->ib_dev.ioctl = mlx4_ib_ioctl; ibdev->ib_dev.query_values = mlx4_ib_query_values; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) { ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } /* * Set experimental data */ ibdev->ib_dev.uverbs_exp_cmd_mask = (1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ) | (1ull << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ); ibdev->ib_dev.exp_create_qp = mlx4_ib_exp_create_qp; ibdev->ib_dev.exp_query_device = mlx4_ib_exp_query_device; if (check_flow_steering_support(dev)) { ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; } else { pr_debug("Device managed flow steering is unavailable for this configuration.\n"); } /* * End of experimental data */ mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; 
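/*
 * Per-port counter setup (loop below): a slave (VF) must allocate its
 * counter through mlx4_counter_alloc(), while the PF reuses the default
 * counter indices reserved in mlx4_init_counters_table(), i.e.
 * ((port << 1) - 1).  Non-ethernet (IB) ports fall back to
 * MLX4_SINK_COUNTER_INDEX with status -ENOSPC.
 */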
for (i = 0; i < ibdev->num_ports; ++i) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { if (mlx4_is_slave(dev)) { ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i].counter_index); } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */ ibdev->counters[i].counter_index = ((i + 1) << 1) - 1; ibdev->counters[i].status = 0; } dev_info(&dev->pdev->dev, "%s: allocated counter index %d for port %d\n", __func__, ibdev->counters[i].counter_index, i+1); } else { ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX; ibdev->counters[i].status = -ENOSPC; } } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && !mlx4_is_mfunc(dev)) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); if (err) goto err_counter; ibdev->ib_uc_qpns_bitmap = kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { dev_err(&dev->pdev->dev, "bit map alloc failed\n"); goto err_steer_qp_release; } bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1); if (err) goto err_steer_free_bitmap; } if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; if (mlx4_ib_init_sriov(ibdev)) goto err_mad; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { if (!iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) { iboe->nb.notifier_call = NULL; goto err_notify; } } if (!iboe->nb_inet.notifier_call) { iboe->nb_inet.notifier_call = mlx4_ib_inet_event; err = register_inetaddr_notifier(&iboe->nb_inet); if (err) { iboe->nb_inet.notifier_call = NULL; goto err_notify; } } mlx4_ib_scan_netdevs(ibdev, NULL, 0); } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) goto err_notify; } if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) goto err_notify; ibdev->ib_active = true; if (mlx4_is_mfunc(ibdev->dev)) init_pkeys(ibdev); /* create paravirt contexts for any VFs which are active */ if (mlx4_is_master(ibdev->dev)) { for (j = 0; j < MLX4_MFUNC_MAX; j++) { if (j == mlx4_master_func_num(ibdev->dev)) continue; if (mlx4_is_slave_active(ibdev->dev, j)) do_slave_init(ibdev, j, 1); } } return ibdev; err_notify: for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } flush_workqueue(wq); mlx4_ib_close_sriov(ibdev); err_mad: mlx4_ib_mad_cleanup(ibdev); err_reg: ib_unregister_device(&ibdev->ib_dev); err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); err_counter: for (; i; --i) 
{ if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1].counter_index); } } err_map: iounmap(ibdev->priv_uar.map); mlx4_ib_free_eqs(dev, ibdev); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); err_pd: mlx4_pd_free(dev, ibdev->priv_pdn); err_dealloc: ib_dealloc_device(&ibdev->ib_dev); return NULL; } int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) { int offset; WARN_ON(!dev->ib_uc_qpns_bitmap); offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, dev->steer_qpn_count, get_count_order(count)); if (offset < 0) return offset; *qpn = dev->steer_qpn_base + offset; return 0; } void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) { if (!qpn || dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return; BUG_ON(qpn < dev->steer_qpn_base); bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, get_count_order(count)); } int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { int err; size_t flow_size; struct ib_flow_attr *flow = NULL; struct ib_flow_spec_ib *ib_spec; if (is_attach) { flow_size = sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_ib); flow = kzalloc(flow_size, GFP_KERNEL); if (!flow) return -ENOMEM; flow->port = mqp->port; flow->num_of_specs = 1; flow->size = flow_size; ib_spec = (struct ib_flow_spec_ib *)(flow + 1); ib_spec->type = IB_FLOW_SPEC_IB; ib_spec->size = sizeof(struct ib_flow_spec_ib); ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num; ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK; err = __mlx4_ib_create_flow(&mqp->ibqp, flow, IB_FLOW_DOMAIN_NIC, MLX4_FS_REGULAR, &mqp->reg_id); } else { err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); } kfree(flow); return err; } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p, j; int dev_idx, ret; if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); mlx4_ib_mad_cleanup(ibdev); for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } dev_idx = -1; if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) { ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx); if (ret != 1) dev_idx = -1; } ib_unregister_device(&ibdev->ib_dev); if (dev_idx >= 0) { spin_lock(&dev_num_str_lock); bitmap_release_region(dev_num_str_bitmap, dev_idx, 0); spin_unlock(&dev_num_str_lock); } if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); kfree(ibdev->ib_uc_qpns_bitmap); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); for (p = 0; p < ibdev->num_ports; ++p) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p].counter_index); } } mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); mlx4_ib_free_eqs(dev, ibdev); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } static void do_slave_init(struct mlx4_ib_dev 
*ibdev, int slave, int do_init) { struct mlx4_ib_demux_work **dm = NULL; struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; if (!mlx4_is_master(dev)) return; dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } for (i = 0; i < dev->caps.num_ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); for (i = 0; i < dev->caps.num_ports; i++) { if (dm[i]) kfree(dm[i]); } goto out; } } /* initialize or tear down tunnel QPs for the slave */ for (i = 0; i < dev->caps.num_ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: if (dm) kfree(dm); return; } static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); struct mlx4_eqe *eqe = NULL; struct ib_event_work *ew; int p = 0; if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: if (p > ibdev->num_ports) return; if (mlx4_is_master(dev) && rdma_port_get_link_layer(&ibdev->ib_dev, p) == IB_LINK_LAYER_INFINIBAND) { mlx4_ib_invalidate_all_guid_record(ibdev, p); } mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is up\n", p); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: if (p > ibdev->num_ports) return; mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is down\n", p); ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) { pr_err("failed to allocate memory for events work\n"); break; } INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); ew->ib_dev = ibdev; /* need to queue only for port owner, which uses GEN_EQE */ if (mlx4_is_master(dev)) queue_work(wq, &ew->work); else handle_port_mgmt_change_event(&ew->work); return; case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; default: return; } ibev.device = ibdev_ptr; ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) { int err; wq = create_singlethread_workqueue("mlx4_ib"); if (!wq) return -ENOMEM; err = mlx4_ib_mcg_init(); if (err) goto clean_proc; init_dev_assign(); err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; return 0; clean_mcg: mlx4_ib_mcg_destroy(); clean_proc: destroy_workqueue(wq); return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); kfree(dev_num_str_bitmap); } module_init_order(mlx4_ib_init, 
SI_ORDER_MIDDLE); module_exit(mlx4_ib_cleanup); static int mlx4ib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlx4ib_mod = { .name = "mlx4ib", .evhand = mlx4ib_evhand, }; DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); MODULE_DEPEND(mlx4ib, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c (revision 300676) @@ -1,198 +1,200 @@ /* * Copyright (c) 2005 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#define LINUXKPI_PARAM_PREFIX mthca_ + #include #include #include #include "mthca_dev.h" enum { MTHCA_CATAS_TYPE_INTERNAL = 0, MTHCA_CATAS_TYPE_UPLINK = 3, MTHCA_CATAS_TYPE_DDR = 4, MTHCA_CATAS_TYPE_PARITY = 5, }; #define MTHCA_CATAS_POLL_INTERVAL (5 * HZ) static DEFINE_SPINLOCK(catas_lock); static LIST_HEAD(catas_list); static struct workqueue_struct *catas_wq; static struct work_struct catas_work; static int catas_reset_disable; module_param_named(catas_reset_disable, catas_reset_disable, int, 0644); MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); static void catas_reset(struct work_struct *work) { struct mthca_dev *dev, *tmpdev; LIST_HEAD(tlist); int ret; mutex_lock(&mthca_device_mutex); spin_lock_irq(&catas_lock); list_splice_init(&catas_list, &tlist); spin_unlock_irq(&catas_lock); list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { struct pci_dev *pdev = dev->pdev; ret = __mthca_restart_one(dev->pdev); /* 'dev' now is not valid */ if (ret) printk(KERN_ERR "mthca %s: Reset failed (%d)\n", pci_name(pdev), ret); else { struct mthca_dev *d = pci_get_drvdata(pdev); mthca_dbg(d, "Reset succeeded\n"); } } mutex_unlock(&mthca_device_mutex); } static void handle_catas(struct mthca_dev *dev) { struct ib_event event; unsigned long flags; const char *type; int i; event.device = &dev->ib_dev; event.event = IB_EVENT_DEVICE_FATAL; event.element.port_num = 0; dev->active = 0; ib_dispatch_event(&event); switch (swab32(readl(dev->catas_err.map)) >> 24) { case MTHCA_CATAS_TYPE_INTERNAL: type = "internal error"; break; case MTHCA_CATAS_TYPE_UPLINK: type = "uplink bus error"; break; case MTHCA_CATAS_TYPE_DDR: type = "DDR data error"; break; case MTHCA_CATAS_TYPE_PARITY: type = "internal parity error"; break; default: type = "unknown error"; break; } mthca_err(dev, "Catastrophic error detected: %s\n", type); for (i = 0; i < dev->catas_err.size; ++i) mthca_err(dev, " buf[%02x]: %08x\n", i, swab32(readl(dev->catas_err.map + i))); if (catas_reset_disable) return; spin_lock_irqsave(&catas_lock, flags); list_add(&dev->catas_err.list, &catas_list); queue_work(catas_wq, &catas_work); spin_unlock_irqrestore(&catas_lock, flags); } static void poll_catas(unsigned long dev_ptr) { struct mthca_dev *dev = (struct mthca_dev *) dev_ptr; int i; for (i = 0; i < dev->catas_err.size; ++i) if (readl(dev->catas_err.map + i)) { handle_catas(dev); return; } mod_timer(&dev->catas_err.timer, round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL)); } void mthca_start_catas_poll(struct mthca_dev *dev) { unsigned long addr; init_timer(&dev->catas_err.timer); dev->catas_err.map = NULL; addr = pci_resource_start(dev->pdev, 0) + ((pci_resource_len(dev->pdev, 0) - 1) & dev->catas_err.addr); dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4); if (!dev->catas_err.map) { mthca_warn(dev, "couldn't map catastrophic error region " "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4); return; } dev->catas_err.timer.data = (unsigned long) dev; dev->catas_err.timer.function = poll_catas; dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; INIT_LIST_HEAD(&dev->catas_err.list); add_timer(&dev->catas_err.timer); } void mthca_stop_catas_poll(struct mthca_dev *dev) { del_timer_sync(&dev->catas_err.timer); if (dev->catas_err.map) iounmap(dev->catas_err.map); spin_lock_irq(&catas_lock); list_del(&dev->catas_err.list); spin_unlock_irq(&catas_lock); } int __init mthca_catas_init(void) { INIT_WORK(&catas_work, catas_reset); catas_wq = create_singlethread_workqueue("mthcacatas"); if 
(!catas_wq) return -ENOMEM; return 0; } void mthca_catas_cleanup(void) { destroy_workqueue(catas_wq); } Index: head/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c (revision 300676) @@ -1,1931 +1,1933 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#define LINUXKPI_PARAM_PREFIX mthca_ + #include #include #include #include #include #include #include "mthca_dev.h" #include "mthca_config_reg.h" #include "mthca_cmd.h" #include "mthca_memfree.h" #define CMD_POLL_TOKEN 0xffff enum { HCR_IN_PARAM_OFFSET = 0x00, HCR_IN_MODIFIER_OFFSET = 0x08, HCR_OUT_PARAM_OFFSET = 0x0c, HCR_TOKEN_OFFSET = 0x14, HCR_STATUS_OFFSET = 0x18, HCR_OPMOD_SHIFT = 12, HCA_E_BIT = 22, HCR_GO_BIT = 23 }; enum { /* initialization and general commands */ CMD_SYS_EN = 0x1, CMD_SYS_DIS = 0x2, CMD_MAP_FA = 0xfff, CMD_UNMAP_FA = 0xffe, CMD_RUN_FW = 0xff6, CMD_MOD_STAT_CFG = 0x34, CMD_QUERY_DEV_LIM = 0x3, CMD_QUERY_FW = 0x4, CMD_ENABLE_LAM = 0xff8, CMD_DISABLE_LAM = 0xff7, CMD_QUERY_DDR = 0x5, CMD_QUERY_ADAPTER = 0x6, CMD_INIT_HCA = 0x7, CMD_CLOSE_HCA = 0x8, CMD_INIT_IB = 0x9, CMD_CLOSE_IB = 0xa, CMD_QUERY_HCA = 0xb, CMD_SET_IB = 0xc, CMD_ACCESS_DDR = 0x2e, CMD_MAP_ICM = 0xffa, CMD_UNMAP_ICM = 0xff9, CMD_MAP_ICM_AUX = 0xffc, CMD_UNMAP_ICM_AUX = 0xffb, CMD_SET_ICM_SIZE = 0xffd, /* TPT commands */ CMD_SW2HW_MPT = 0xd, CMD_QUERY_MPT = 0xe, CMD_HW2SW_MPT = 0xf, CMD_READ_MTT = 0x10, CMD_WRITE_MTT = 0x11, CMD_SYNC_TPT = 0x2f, /* EQ commands */ CMD_MAP_EQ = 0x12, CMD_SW2HW_EQ = 0x13, CMD_HW2SW_EQ = 0x14, CMD_QUERY_EQ = 0x15, /* CQ commands */ CMD_SW2HW_CQ = 0x16, CMD_HW2SW_CQ = 0x17, CMD_QUERY_CQ = 0x18, CMD_RESIZE_CQ = 0x2c, /* SRQ commands */ CMD_SW2HW_SRQ = 0x35, CMD_HW2SW_SRQ = 0x36, CMD_QUERY_SRQ = 0x37, CMD_ARM_SRQ = 0x40, /* QP/EE commands */ CMD_RST2INIT_QPEE = 0x19, CMD_INIT2RTR_QPEE = 0x1a, CMD_RTR2RTS_QPEE = 0x1b, CMD_RTS2RTS_QPEE = 0x1c, CMD_SQERR2RTS_QPEE = 0x1d, CMD_2ERR_QPEE = 0x1e, CMD_RTS2SQD_QPEE = 0x1f, CMD_SQD2SQD_QPEE = 0x38, CMD_SQD2RTS_QPEE = 0x20, CMD_ERR2RST_QPEE = 0x21, CMD_QUERY_QPEE = 0x22, CMD_INIT2INIT_QPEE = 0x2d, CMD_SUSPEND_QPEE = 0x32, CMD_UNSUSPEND_QPEE = 0x33, /* special QPs and management commands */ CMD_CONF_SPECIAL_QP = 0x23, CMD_MAD_IFC = 0x24, /* multicast commands */ CMD_READ_MGM = 0x25, CMD_WRITE_MGM = 0x26, CMD_MGID_HASH = 0x27, /* miscellaneous commands */ CMD_DIAG_RPRT = 0x30, CMD_NOP = 0x31, /* debug commands */ CMD_QUERY_DEBUG_MSG = 0x2a, CMD_SET_DEBUG_MSG = 0x2b, }; /* * According to Mellanox code, FW may be starved and never complete * commands. So we can't use strict timeouts described in PRM -- we * just arbitrarily select 60 seconds for now. 
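 * (The stricter PRM-derived per-class timeouts are retained below under
 * #if 0 for reference.)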
*/ #if 0 /* * Round up and add 1 to make sure we get the full wait time (since we * will be starting in the middle of a jiffy) */ enum { CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1, CMD_TIME_CLASS_B = (HZ + 99) / 100 + 1, CMD_TIME_CLASS_C = (HZ + 9) / 10 + 1, CMD_TIME_CLASS_D = 60 * HZ }; #else #define CMD_TIME_CLASS_A (60 * HZ) #define CMD_TIME_CLASS_B (60 * HZ) #define CMD_TIME_CLASS_C (60 * HZ) #define CMD_TIME_CLASS_D (60 * HZ) #endif #define GO_BIT_TIMEOUT (HZ * 10) struct mthca_cmd_context { struct completion done; int result; int next; u64 out_param; u16 token; u8 status; }; static int fw_cmd_doorbell = 0; module_param(fw_cmd_doorbell, int, 0644); MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero " "(and supported by FW)"); static inline int go_bit(struct mthca_dev *dev) { return readl(dev->hcr + HCR_STATUS_OFFSET) & swab32(1 << HCR_GO_BIT); } static void mthca_cmd_post_dbell(struct mthca_dev *dev, u64 in_param, u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, u16 token) { void __iomem *ptr = dev->cmd.dbell_map; u16 *offs = dev->cmd.dbell_offsets; __raw_writel((__force u32) cpu_to_be32(in_param >> 32), ptr + offs[0]); wmb(); __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), ptr + offs[1]); wmb(); __raw_writel((__force u32) cpu_to_be32(in_modifier), ptr + offs[2]); wmb(); __raw_writel((__force u32) cpu_to_be32(out_param >> 32), ptr + offs[3]); wmb(); __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]); wmb(); __raw_writel((__force u32) cpu_to_be32(token << 16), ptr + offs[5]); wmb(); __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | (1 << HCA_E_BIT) | (op_modifier << HCR_OPMOD_SHIFT) | op), ptr + offs[6]); wmb(); __raw_writel((__force u32) 0, ptr + offs[7]); wmb(); } static int mthca_cmd_post_hcr(struct mthca_dev *dev, u64 in_param, u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, u16 token, int event) { if (event) { unsigned long end = jiffies + GO_BIT_TIMEOUT; while (go_bit(dev) && time_before(jiffies, end)) sched_yield(); } if (go_bit(dev)) return -EAGAIN; /* * We use writel (instead of something like memcpy_toio) * because writes of less than 32 bits to the HCR don't work * (and some architectures such as ia64 implement memcpy_toio * in terms of writeb). */ __raw_writel((__force u32) cpu_to_be32(in_param >> 32), dev->hcr + 0 * 4); __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), dev->hcr + 1 * 4); __raw_writel((__force u32) cpu_to_be32(in_modifier), dev->hcr + 2 * 4); __raw_writel((__force u32) cpu_to_be32(out_param >> 32), dev->hcr + 3 * 4); __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4); __raw_writel((__force u32) cpu_to_be32(token << 16), dev->hcr + 5 * 4); /* __raw_writel may not order writes. */ wmb(); __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | (event ? (1 << HCA_E_BIT) : 0) | (op_modifier << HCR_OPMOD_SHIFT) | op), dev->hcr + 6 * 4); return 0; } static int mthca_cmd_post(struct mthca_dev *dev, u64 in_param, u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, u16 token, int event) { int err = 0; mutex_lock(&dev->cmd.hcr_mutex); if (event && dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS && fw_cmd_doorbell) mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier, op_modifier, op, token); else err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier, op_modifier, op, token, event); /* * Make sure that our HCR writes don't get mixed in with * writes from another CPU starting a FW command. 
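 * The mmiowb() below, issued while hcr_mutex is still held, is what
 * enforces that ordering on weakly ordered buses.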
*/ mmiowb(); mutex_unlock(&dev->cmd.hcr_mutex); return err; } static int mthca_cmd_poll(struct mthca_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, u8 *status) { int err = 0; unsigned long end; down(&dev->cmd.poll_sem); err = mthca_cmd_post(dev, in_param, out_param ? *out_param : 0, in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); if (err) goto out; end = timeout + jiffies; while (go_bit(dev) && time_before(jiffies, end)) sched_yield(); if (go_bit(dev)) { err = -EBUSY; goto out; } if (out_is_imm) *out_param = (u64) be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET)) << 32 | (u64) be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4)); *status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24; out: up(&dev->cmd.poll_sem); return err; } void mthca_cmd_event(struct mthca_dev *dev, u16 token, u8 status, u64 out_param) { struct mthca_cmd_context *context = &dev->cmd.context[token & dev->cmd.token_mask]; /* previously timed out command completing at long last */ if (token != context->token) return; context->result = 0; context->status = status; context->out_param = out_param; complete(&context->done); } static int mthca_cmd_wait(struct mthca_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, u8 *status) { int err = 0; struct mthca_cmd_context *context; down(&dev->cmd.event_sem); spin_lock(&dev->cmd.context_lock); BUG_ON(dev->cmd.free_head < 0); context = &dev->cmd.context[dev->cmd.free_head]; context->token += dev->cmd.token_mask + 1; dev->cmd.free_head = context->next; spin_unlock(&dev->cmd.context_lock); init_completion(&context->done); err = mthca_cmd_post(dev, in_param, out_param ? *out_param : 0, in_modifier, op_modifier, op, context->token, 1); if (err) goto out; if (!wait_for_completion_timeout(&context->done, timeout)) { err = -EBUSY; goto out; } err = context->result; if (err) goto out; *status = context->status; if (*status) mthca_dbg(dev, "Command %02x completed with status %02x\n", op, *status); if (out_is_imm) *out_param = context->out_param; out: spin_lock(&dev->cmd.context_lock); context->next = dev->cmd.free_head; dev->cmd.free_head = context - dev->cmd.context; spin_unlock(&dev->cmd.context_lock); up(&dev->cmd.event_sem); return err; } /* Invoke a command with an output mailbox */ static int mthca_cmd_box(struct mthca_dev *dev, u64 in_param, u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, u8 *status) { if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) return mthca_cmd_wait(dev, in_param, &out_param, 0, in_modifier, op_modifier, op, timeout, status); else return mthca_cmd_poll(dev, in_param, &out_param, 0, in_modifier, op_modifier, op, timeout, status); } /* Invoke a command with no output parameter */ static int mthca_cmd(struct mthca_dev *dev, u64 in_param, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, u8 *status) { return mthca_cmd_box(dev, in_param, 0, in_modifier, op_modifier, op, timeout, status); } /* * Invoke a command with an immediate output parameter (and copy the * output into the caller's out_param pointer after the command * executes). 
*/ static int mthca_cmd_imm(struct mthca_dev *dev, u64 in_param, u64 *out_param, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, u8 *status) { if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) return mthca_cmd_wait(dev, in_param, out_param, 1, in_modifier, op_modifier, op, timeout, status); else return mthca_cmd_poll(dev, in_param, out_param, 1, in_modifier, op_modifier, op, timeout, status); } int mthca_cmd_init(struct mthca_dev *dev) { mutex_init(&dev->cmd.hcr_mutex); sema_init(&dev->cmd.poll_sem, 1); dev->cmd.flags = 0; dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE, MTHCA_HCR_SIZE); if (!dev->hcr) { mthca_err(dev, "Couldn't map command register."); return -ENOMEM; } dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev, MTHCA_MAILBOX_SIZE, MTHCA_MAILBOX_SIZE, 0); if (!dev->cmd.pool) { iounmap(dev->hcr); return -ENOMEM; } return 0; } void mthca_cmd_cleanup(struct mthca_dev *dev) { pci_pool_destroy(dev->cmd.pool); iounmap(dev->hcr); if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS) iounmap(dev->cmd.dbell_map); } /* * Switch to using events to issue FW commands (should be called after * event queue to command events has been initialized). */ int mthca_cmd_use_events(struct mthca_dev *dev) { int i; dev->cmd.context = kmalloc(dev->cmd.max_cmds * sizeof (struct mthca_cmd_context), GFP_KERNEL); if (!dev->cmd.context) return -ENOMEM; for (i = 0; i < dev->cmd.max_cmds; ++i) { dev->cmd.context[i].token = i; dev->cmd.context[i].next = i + 1; } dev->cmd.context[dev->cmd.max_cmds - 1].next = -1; dev->cmd.free_head = 0; sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds); spin_lock_init(&dev->cmd.context_lock); for (dev->cmd.token_mask = 1; dev->cmd.token_mask < dev->cmd.max_cmds; dev->cmd.token_mask <<= 1) ; /* nothing */ --dev->cmd.token_mask; dev->cmd.flags |= MTHCA_CMD_USE_EVENTS; down(&dev->cmd.poll_sem); return 0; } /* * Switch back to polling (used when shutting down the device) */ void mthca_cmd_use_polling(struct mthca_dev *dev) { int i; dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS; for (i = 0; i < dev->cmd.max_cmds; ++i) down(&dev->cmd.event_sem); kfree(dev->cmd.context); up(&dev->cmd.poll_sem); } struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, gfp_t gfp_mask) { struct mthca_mailbox *mailbox; mailbox = kmalloc(sizeof *mailbox, gfp_mask); if (!mailbox) return ERR_PTR(-ENOMEM); mailbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma); if (!mailbox->buf) { kfree(mailbox); return ERR_PTR(-ENOMEM); } return mailbox; } void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox) { if (!mailbox) return; pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } int mthca_SYS_EN(struct mthca_dev *dev, u8 *status) { u64 out; int ret; ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D, status); if (*status == MTHCA_CMD_STAT_DDR_MEM_ERR) mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, " "sladdr=%d, SPD source=%s\n", (int) (out >> 6) & 0xf, (int) (out >> 4) & 3, (int) (out >> 1) & 7, (int) out & 1 ? 
"NVMEM" : "DIMM"); return ret; } int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status) { return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C, status); } static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm, u64 virt, u8 *status) { struct mthca_mailbox *mailbox; struct mthca_icm_iter iter; __be64 *pages; int lg; int nent = 0; int i; int err = 0; int ts = 0, tc = 0; mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE); pages = mailbox->buf; for (mthca_icm_first(icm, &iter); !mthca_icm_last(&iter); mthca_icm_next(&iter)) { /* * We have to pass pages that are aligned to their * size, so find the least significant 1 in the * address or size and use that as our log2 size. */ lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1; if (lg < MTHCA_ICM_PAGE_SHIFT) { mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n", MTHCA_ICM_PAGE_SIZE, (unsigned long long) mthca_icm_addr(&iter), mthca_icm_size(&iter)); err = -EINVAL; goto out; } for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) { if (virt != -1) { pages[nent * 2] = cpu_to_be64(virt); virt += 1 << lg; } pages[nent * 2 + 1] = cpu_to_be64((mthca_icm_addr(&iter) + (i << lg)) | (lg - MTHCA_ICM_PAGE_SHIFT)); ts += 1 << (lg - 10); ++tc; if (++nent == MTHCA_MAILBOX_SIZE / 16) { err = mthca_cmd(dev, mailbox->dma, nent, 0, op, CMD_TIME_CLASS_B, status); if (err || *status) goto out; nent = 0; } } } if (nent) err = mthca_cmd(dev, mailbox->dma, nent, 0, op, CMD_TIME_CLASS_B, status); switch (op) { case CMD_MAP_FA: mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts); break; case CMD_MAP_ICM_AUX: mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts); break; case CMD_MAP_ICM: mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n", tc, ts, (unsigned long long) virt - (ts << 10)); break; } out: mthca_free_mailbox(dev, mailbox); return err; } int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status) { return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1, status); } int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status) { return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B, status); } int mthca_RUN_FW(struct mthca_dev *dev, u8 *status) { return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A, status); } static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base) { unsigned long addr; u16 max_off = 0; int i; for (i = 0; i < 8; ++i) max_off = max(max_off, dev->cmd.dbell_offsets[i]); if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) { mthca_warn(dev, "Firmware doorbell region at 0x%016llx, " "length 0x%x crosses a page boundary\n", (unsigned long long) base, max_off); return; } addr = pci_resource_start(dev->pdev, 2) + ((pci_resource_len(dev->pdev, 2) - 1) & base); dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32)); if (!dev->cmd.dbell_map) return; dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS; mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n"); } int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status) { struct mthca_mailbox *mailbox; u32 *outbox; u64 base; u32 tmp; int err = 0; u8 lg; int i; #define QUERY_FW_OUT_SIZE 0x100 #define QUERY_FW_VER_OFFSET 0x00 #define QUERY_FW_MAX_CMD_OFFSET 0x0f #define QUERY_FW_ERR_START_OFFSET 0x30 #define QUERY_FW_ERR_SIZE_OFFSET 0x38 #define QUERY_FW_CMD_DB_EN_OFFSET 0x10 #define QUERY_FW_CMD_DB_OFFSET 0x50 #define QUERY_FW_CMD_DB_BASE 0x60 #define QUERY_FW_START_OFFSET 0x20 #define QUERY_FW_END_OFFSET 0x28 #define 
QUERY_FW_SIZE_OFFSET 0x00 #define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 #define QUERY_FW_EQ_ARM_BASE_OFFSET 0x40 #define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48 mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW, CMD_TIME_CLASS_A, status); if (err) goto out; MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET); /* * FW subminor version is at more significant bits than minor * version, so swap here. */ dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) | ((dev->fw_ver & 0xffff0000ull) >> 16) | ((dev->fw_ver & 0x0000ffffull) << 16); MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); dev->cmd.max_cmds = 1 << lg; mthca_dbg(dev, "FW version %012llx, max commands %d\n", (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n", (unsigned long long) dev->catas_err.addr, dev->catas_err.size); MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET); if (tmp & 0x1) { mthca_dbg(dev, "FW supports commands through doorbells\n"); MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE); for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i) MTHCA_GET(dev->cmd.dbell_offsets[i], outbox, QUERY_FW_CMD_DB_OFFSET + (i << 1)); mthca_setup_cmd_doorbells(dev, base); } if (mthca_is_memfree(dev)) { MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); MTHCA_GET(dev->fw.arbel.clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET); MTHCA_GET(dev->fw.arbel.eq_arm_base, outbox, QUERY_FW_EQ_ARM_BASE_OFFSET); MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET); mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2); /* * Round up number of system pages needed in case * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. 
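 * Worked example (assuming 4 KB ICM pages, MTHCA_ICM_PAGE_SHIFT = 12,
 * on a kernel with 16 KB pages, PAGE_SHIFT = 14): the multiplier is
 * PAGE_SIZE / MTHCA_ICM_PAGE_SIZE = 4 and the shift is
 * PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT = 2, so a count of 5 ICM pages
 * becomes ALIGN(5, 4) >> 2 = 8 >> 2 = 2 system pages, i.e. rounded
 * up, never down.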
*/ dev->fw.arbel.fw_pages = ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n", (unsigned long long) dev->fw.arbel.clr_int_base, (unsigned long long) dev->fw.arbel.eq_arm_base, (unsigned long long) dev->fw.arbel.eq_set_ci_base); } else { MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET); MTHCA_GET(dev->fw.tavor.fw_end, outbox, QUERY_FW_END_OFFSET); mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n", (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10), (unsigned long long) dev->fw.tavor.fw_start, (unsigned long long) dev->fw.tavor.fw_end); } out: mthca_free_mailbox(dev, mailbox); return err; } int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status) { struct mthca_mailbox *mailbox; u8 info; u32 *outbox; int err = 0; #define ENABLE_LAM_OUT_SIZE 0x100 #define ENABLE_LAM_START_OFFSET 0x00 #define ENABLE_LAM_END_OFFSET 0x08 #define ENABLE_LAM_INFO_OFFSET 0x13 #define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4) #define ENABLE_LAM_INFO_ECC_MASK 0x3 mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM, CMD_TIME_CLASS_C, status); if (err) goto out; if (*status == MTHCA_CMD_STAT_LAM_NOT_PRE) goto out; MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET); MTHCA_GET(dev->ddr_end, outbox, ENABLE_LAM_END_OFFSET); MTHCA_GET(info, outbox, ENABLE_LAM_INFO_OFFSET); if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) != !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { mthca_info(dev, "FW reports that HCA-attached memory " "is %s hidden; does not match PCI config\n", (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ? "" : "not"); } if (info & ENABLE_LAM_INFO_HIDDEN_FLAG) mthca_dbg(dev, "HCA-attached memory is hidden.\n"); mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", (int) ((dev->ddr_end - dev->ddr_start) >> 10), (unsigned long long) dev->ddr_start, (unsigned long long) dev->ddr_end); out: mthca_free_mailbox(dev, mailbox); return err; } int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status) { return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C, status); } int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status) { struct mthca_mailbox *mailbox; u8 info; u32 *outbox; int err = 0; #define QUERY_DDR_OUT_SIZE 0x100 #define QUERY_DDR_START_OFFSET 0x00 #define QUERY_DDR_END_OFFSET 0x08 #define QUERY_DDR_INFO_OFFSET 0x13 #define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4) #define QUERY_DDR_INFO_ECC_MASK 0x3 mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR, CMD_TIME_CLASS_A, status); if (err) goto out; MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET); MTHCA_GET(dev->ddr_end, outbox, QUERY_DDR_END_OFFSET); MTHCA_GET(info, outbox, QUERY_DDR_INFO_OFFSET); if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) != !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { mthca_info(dev, "FW reports that HCA-attached memory " "is %s hidden; does not match PCI config\n", (info & QUERY_DDR_INFO_HIDDEN_FLAG) ? 
"" : "not"); } if (info & QUERY_DDR_INFO_HIDDEN_FLAG) mthca_dbg(dev, "HCA-attached memory is hidden.\n"); mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", (int) ((dev->ddr_end - dev->ddr_start) >> 10), (unsigned long long) dev->ddr_start, (unsigned long long) dev->ddr_end); out: mthca_free_mailbox(dev, mailbox); return err; } int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, struct mthca_dev_lim *dev_lim, u8 *status) { struct mthca_mailbox *mailbox; u32 *outbox; u8 field; u16 size; u16 stat_rate; int err; #define QUERY_DEV_LIM_OUT_SIZE 0x100 #define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET 0x10 #define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET 0x11 #define QUERY_DEV_LIM_RSVD_QP_OFFSET 0x12 #define QUERY_DEV_LIM_MAX_QP_OFFSET 0x13 #define QUERY_DEV_LIM_RSVD_SRQ_OFFSET 0x14 #define QUERY_DEV_LIM_MAX_SRQ_OFFSET 0x15 #define QUERY_DEV_LIM_RSVD_EEC_OFFSET 0x16 #define QUERY_DEV_LIM_MAX_EEC_OFFSET 0x17 #define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET 0x19 #define QUERY_DEV_LIM_RSVD_CQ_OFFSET 0x1a #define QUERY_DEV_LIM_MAX_CQ_OFFSET 0x1b #define QUERY_DEV_LIM_MAX_MPT_OFFSET 0x1d #define QUERY_DEV_LIM_RSVD_EQ_OFFSET 0x1e #define QUERY_DEV_LIM_MAX_EQ_OFFSET 0x1f #define QUERY_DEV_LIM_RSVD_MTT_OFFSET 0x20 #define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET 0x21 #define QUERY_DEV_LIM_RSVD_MRW_OFFSET 0x22 #define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET 0x23 #define QUERY_DEV_LIM_MAX_AV_OFFSET 0x27 #define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET 0x29 #define QUERY_DEV_LIM_MAX_RES_QP_OFFSET 0x2b #define QUERY_DEV_LIM_MAX_RDMA_OFFSET 0x2f #define QUERY_DEV_LIM_RSZ_SRQ_OFFSET 0x33 #define QUERY_DEV_LIM_ACK_DELAY_OFFSET 0x35 #define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36 #define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37 #define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b #define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c #define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_LIM_FLAGS_OFFSET 0x44 #define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48 #define QUERY_DEV_LIM_UAR_SZ_OFFSET 0x49 #define QUERY_DEV_LIM_PAGE_SZ_OFFSET 0x4b #define QUERY_DEV_LIM_MAX_SG_OFFSET 0x51 #define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET 0x52 #define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET 0x55 #define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56 #define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET 0x61 #define QUERY_DEV_LIM_RSVD_MCG_OFFSET 0x62 #define QUERY_DEV_LIM_MAX_MCG_OFFSET 0x63 #define QUERY_DEV_LIM_RSVD_PD_OFFSET 0x64 #define QUERY_DEV_LIM_MAX_PD_OFFSET 0x65 #define QUERY_DEV_LIM_RSVD_RDD_OFFSET 0x66 #define QUERY_DEV_LIM_MAX_RDD_OFFSET 0x67 #define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET 0x82 #define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET 0x84 #define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET 0x86 #define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET 0x88 #define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET 0x8a #define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET 0x8c #define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET 0x8e #define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET 0x90 #define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET 0x92 #define QUERY_DEV_LIM_PBL_SZ_OFFSET 0x96 #define QUERY_DEV_LIM_BMME_FLAGS_OFFSET 0x97 #define QUERY_DEV_LIM_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_LIM_LAMR_OFFSET 0x9f #define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET 0xa0 mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM, CMD_TIME_CLASS_A, status); if (err) goto out; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET); dev_lim->reserved_qps = 1 << (field & 0xf); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET); dev_lim->max_qps = 1 << 
(field & 0x1f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET); dev_lim->reserved_srqs = 1 << (field >> 4); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET); dev_lim->max_srqs = 1 << (field & 0x1f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET); dev_lim->reserved_eecs = 1 << (field & 0xf); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET); dev_lim->max_eecs = 1 << (field & 0x1f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET); dev_lim->max_cq_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET); dev_lim->reserved_cqs = 1 << (field & 0xf); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET); dev_lim->max_cqs = 1 << (field & 0x1f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET); dev_lim->max_mpts = 1 << (field & 0x3f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET); dev_lim->reserved_eqs = 1 << (field & 0xf); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET); dev_lim->max_eqs = 1 << (field & 0x7); MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); if (mthca_is_memfree(dev)) dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64), dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size; else dev_lim->reserved_mtts = 1 << (field >> 4); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); dev_lim->max_mrw_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET); dev_lim->reserved_mrws = 1 << (field & 0xf); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET); dev_lim->max_mtt_seg = 1 << (field & 0x3f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET); dev_lim->max_requester_per_qp = 1 << (field & 0x3f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET); dev_lim->max_responder_per_qp = 1 << (field & 0x3f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET); dev_lim->max_rdma_global = 1 << (field & 0x3f); MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET); dev_lim->local_ca_ack_delay = field & 0x1f; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET); dev_lim->max_mtu = field >> 4; dev_lim->max_port_width = field & 0xf; MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET); dev_lim->max_vl = field >> 4; dev_lim->num_ports = field & 0xf; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET); dev_lim->max_gids = 1 << (field & 0xf); MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET); dev_lim->stat_rate_support = stat_rate; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET); dev_lim->max_pkeys = 1 << (field & 0xf); MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET); MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET); dev_lim->reserved_uars = field >> 4; MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET); dev_lim->uar_size = 1 << ((field & 0x3f) + 20); MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET); dev_lim->min_page_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET); dev_lim->max_sg = field; MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET); dev_lim->max_desc_sz = size; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET); dev_lim->max_qp_per_mcg = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET); dev_lim->reserved_mgms = field & 0xf; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET); dev_lim->max_mcgs = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET); dev_lim->reserved_pds = field >> 4; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET); dev_lim->max_pds = 1 << (field & 0x3f); MTHCA_GET(field, outbox, 
QUERY_DEV_LIM_RSVD_RDD_OFFSET); dev_lim->reserved_rdds = field >> 4; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET); dev_lim->max_rdds = 1 << (field & 0x3f); MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET); dev_lim->eec_entry_sz = size; MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET); dev_lim->qpc_entry_sz = size; MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET); dev_lim->eeec_entry_sz = size; MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET); dev_lim->eqpc_entry_sz = size; MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET); dev_lim->eqc_entry_sz = size; MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET); dev_lim->cqc_entry_sz = size; MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET); dev_lim->srq_entry_sz = size; MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET); dev_lim->uar_scratch_entry_sz = size; if (mthca_is_memfree(dev)) { MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); dev_lim->max_srq_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); dev_lim->max_qp_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET); dev_lim->hca.arbel.resize_srq = field & 1; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET); dev_lim->max_sg = min_t(int, field, dev_lim->max_sg); MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET); dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz); MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET); dev_lim->mpt_entry_sz = size; MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET); dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f); MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox, QUERY_DEV_LIM_BMME_FLAGS_OFFSET); MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox, QUERY_DEV_LIM_RSVD_LKEY_OFFSET); MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET); dev_lim->hca.arbel.lam_required = field & 1; MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox, QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET); if (dev_lim->hca.arbel.bmme_flags & 1) mthca_dbg(dev, "Base MM extensions: yes " "(flags %d, max PBL %d, rsvd L_Key %08x)\n", dev_lim->hca.arbel.bmme_flags, dev_lim->hca.arbel.max_pbl_sz, dev_lim->hca.arbel.reserved_lkey); else mthca_dbg(dev, "Base MM extensions: no\n"); mthca_dbg(dev, "Max ICM size %lld MB\n", (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20); } else { MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); dev_lim->max_srq_sz = (1 << field) - 1; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); dev_lim->max_qp_sz = (1 << field) - 1; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET); dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f); dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE; } mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz); mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz); mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz); mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n", dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz); mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", dev_lim->reserved_mrws, dev_lim->reserved_mtts); mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars); mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", 
dev_lim->max_qp_per_mcg, dev_lim->reserved_mgms); mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz); mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags); out: mthca_free_mailbox(dev, mailbox); return err; } static void get_board_id(void *vsd, char *board_id) { int i; #define VSD_OFFSET_SIG1 0x00 #define VSD_OFFSET_SIG2 0xde #define VSD_OFFSET_MLX_BOARD_ID 0xd0 #define VSD_OFFSET_TS_BOARD_ID 0x20 #define VSD_SIGNATURE_TOPSPIN 0x5ad memset(board_id, 0, MTHCA_BOARD_ID_LEN); if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) { strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN); } else { /* * The board ID is a string but the firmware byte * swaps each 4-byte word before passing it back to * us. Therefore we need to swab it before printing. */ for (i = 0; i < 4; ++i) ((u32 *) board_id)[i] = swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4)); } } int mthca_QUERY_ADAPTER(struct mthca_dev *dev, struct mthca_adapter *adapter, u8 *status) { struct mthca_mailbox *mailbox; u32 *outbox; int err; #define QUERY_ADAPTER_OUT_SIZE 0x100 #define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00 #define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04 #define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08 #define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 #define QUERY_ADAPTER_VSD_OFFSET 0x20 mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER, CMD_TIME_CLASS_A, status); if (err) goto out; if (!mthca_is_memfree(dev)) { MTHCA_GET(adapter->vendor_id, outbox, QUERY_ADAPTER_VENDOR_ID_OFFSET); MTHCA_GET(adapter->device_id, outbox, QUERY_ADAPTER_DEVICE_ID_OFFSET); MTHCA_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET); } MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, adapter->board_id); out: mthca_free_mailbox(dev, mailbox); return err; } int mthca_INIT_HCA(struct mthca_dev *dev, struct mthca_init_hca_param *param, u8 *status) { struct mthca_mailbox *mailbox; __be32 *inbox; int err; #define INIT_HCA_IN_SIZE 0x200 #define INIT_HCA_FLAGS1_OFFSET 0x00c #define INIT_HCA_FLAGS2_OFFSET 0x014 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) #define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) #define INIT_HCA_EEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x20) #define INIT_HCA_LOG_EEC_OFFSET (INIT_HCA_QPC_OFFSET + 0x27) #define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28) #define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) #define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) #define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) #define INIT_HCA_EQPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) #define INIT_HCA_EEEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) #define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) #define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67) #define INIT_HCA_RDB_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70) #define INIT_HCA_UDAV_OFFSET 0x0b0 #define INIT_HCA_UDAV_LKEY_OFFSET (INIT_HCA_UDAV_OFFSET + 0x0) #define INIT_HCA_UDAV_PD_OFFSET (INIT_HCA_UDAV_OFFSET + 0x4) #define INIT_HCA_MCAST_OFFSET 0x0c0 #define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) #define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) #define INIT_HCA_MC_HASH_SZ_OFFSET 
(INIT_HCA_MCAST_OFFSET + 0x16) #define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) #define INIT_HCA_TPT_OFFSET 0x0f0 #define INIT_HCA_MPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) #define INIT_HCA_MTT_SEG_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x09) #define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) #define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) #define INIT_HCA_UAR_OFFSET 0x120 #define INIT_HCA_UAR_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x00) #define INIT_HCA_UARC_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x09) #define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) #define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) #define INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10) #define INIT_HCA_UAR_CTX_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x18) mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; memset(inbox, 0, INIT_HCA_IN_SIZE); if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET); #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1); #elif defined(__BIG_ENDIAN) *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1); #else #error Host endianness not defined #endif /* Check port for UD address vector: */ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1); /* Enable IPoIB checksumming if we can: */ if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM) *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3); /* We leave wqe_quota, responder_exu, etc as 0 (default) */ /* QPC/EEC/CQC/EQC/RDB attributes */ MTHCA_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); MTHCA_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET); MTHCA_PUT(inbox, param->eec_base, INIT_HCA_EEC_BASE_OFFSET); MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET); MTHCA_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET); MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET); MTHCA_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET); MTHCA_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET); MTHCA_PUT(inbox, param->eqpc_base, INIT_HCA_EQPC_BASE_OFFSET); MTHCA_PUT(inbox, param->eeec_base, INIT_HCA_EEEC_BASE_OFFSET); MTHCA_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET); MTHCA_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET); MTHCA_PUT(inbox, param->rdb_base, INIT_HCA_RDB_BASE_OFFSET); /* UD AV attributes */ /* multicast attributes */ MTHCA_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); MTHCA_PUT(inbox, param->mc_hash_sz, INIT_HCA_MC_HASH_SZ_OFFSET); MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); /* TPT attributes */ MTHCA_PUT(inbox, param->mpt_base, INIT_HCA_MPT_BASE_OFFSET); if (!mthca_is_memfree(dev)) MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET); MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); MTHCA_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); /* UAR attributes */ { u8 uar_page_sz = PAGE_SHIFT - 12; MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); } MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET); if (mthca_is_memfree(dev)) { MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET); MTHCA_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); MTHCA_PUT(inbox, param->uarc_base, INIT_HCA_UAR_CTX_BASE_OFFSET); } err = mthca_cmd(dev, mailbox->dma, 0, 0, CMD_INIT_HCA, 
CMD_TIME_CLASS_D, status); mthca_free_mailbox(dev, mailbox); return err; } int mthca_INIT_IB(struct mthca_dev *dev, struct mthca_init_ib_param *param, int port, u8 *status) { struct mthca_mailbox *mailbox; u32 *inbox; int err; u32 flags; #define INIT_IB_IN_SIZE 56 #define INIT_IB_FLAGS_OFFSET 0x00 #define INIT_IB_FLAG_SIG (1 << 18) #define INIT_IB_FLAG_NG (1 << 17) #define INIT_IB_FLAG_G0 (1 << 16) #define INIT_IB_VL_SHIFT 4 #define INIT_IB_PORT_WIDTH_SHIFT 8 #define INIT_IB_MTU_SHIFT 12 #define INIT_IB_MAX_GID_OFFSET 0x06 #define INIT_IB_MAX_PKEY_OFFSET 0x0a #define INIT_IB_GUID0_OFFSET 0x10 #define INIT_IB_NODE_GUID_OFFSET 0x18 #define INIT_IB_SI_GUID_OFFSET 0x20 mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; memset(inbox, 0, INIT_IB_IN_SIZE); flags = 0; flags |= param->set_guid0 ? INIT_IB_FLAG_G0 : 0; flags |= param->set_node_guid ? INIT_IB_FLAG_NG : 0; flags |= param->set_si_guid ? INIT_IB_FLAG_SIG : 0; flags |= param->vl_cap << INIT_IB_VL_SHIFT; flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT; flags |= param->mtu_cap << INIT_IB_MTU_SHIFT; MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET); MTHCA_PUT(inbox, param->gid_cap, INIT_IB_MAX_GID_OFFSET); MTHCA_PUT(inbox, param->pkey_cap, INIT_IB_MAX_PKEY_OFFSET); MTHCA_PUT(inbox, param->guid0, INIT_IB_GUID0_OFFSET); MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET); MTHCA_PUT(inbox, param->si_guid, INIT_IB_SI_GUID_OFFSET); err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB, CMD_TIME_CLASS_A, status); mthca_free_mailbox(dev, mailbox); return err; } int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status) { return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A, status); } int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status) { return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C, status); } int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param, int port, u8 *status) { struct mthca_mailbox *mailbox; u32 *inbox; int err; u32 flags = 0; #define SET_IB_IN_SIZE 0x40 #define SET_IB_FLAGS_OFFSET 0x00 #define SET_IB_FLAG_SIG (1 << 18) #define SET_IB_FLAG_RQK (1 << 0) #define SET_IB_CAP_MASK_OFFSET 0x04 #define SET_IB_SI_GUID_OFFSET 0x08 mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; memset(inbox, 0, SET_IB_IN_SIZE); flags |= param->set_si_guid ? SET_IB_FLAG_SIG : 0; flags |= param->reset_qkey_viol ? 
SET_IB_FLAG_RQK : 0; MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET); MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET); MTHCA_PUT(inbox, param->si_guid, SET_IB_SI_GUID_OFFSET); err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB, CMD_TIME_CLASS_B, status); mthca_free_mailbox(dev, mailbox); return err; } int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status) { return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt, status); } int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status) { struct mthca_mailbox *mailbox; __be64 *inbox; int err; mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; inbox[0] = cpu_to_be64(virt); inbox[1] = cpu_to_be64(dma_addr); err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM, CMD_TIME_CLASS_B, status); mthca_free_mailbox(dev, mailbox); if (!err) mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n", (unsigned long long) dma_addr, (unsigned long long) virt); return err; } int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status) { mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n", page_count, (unsigned long long) virt); return mthca_cmd(dev, virt, page_count, 0, CMD_UNMAP_ICM, CMD_TIME_CLASS_B, status); } int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status) { return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1, status); } int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status) { return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B, status); } int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages, u8 *status) { int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, 0, CMD_SET_ICM_SIZE, CMD_TIME_CLASS_A, status); if (ret || *status) return ret; /* * Round up number of system pages needed in case * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. */ *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); return 0; } int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int mpt_index, u8 *status) { return mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT, CMD_TIME_CLASS_B, status); } int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int mpt_index, u8 *status) { return mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index, !mailbox, CMD_HW2SW_MPT, CMD_TIME_CLASS_B, status); } int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int num_mtt, u8 *status) { return mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT, CMD_TIME_CLASS_B, status); } int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status) { return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B, status); } int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap, int eq_num, u8 *status) { mthca_dbg(dev, "%s mask %016llx for eqn %d\n", unmap ? 
"Clearing" : "Setting", (unsigned long long) event_mask, eq_num); return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num, 0, CMD_MAP_EQ, CMD_TIME_CLASS_B, status); } int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int eq_num, u8 *status) { return mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ, CMD_TIME_CLASS_A, status); } int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int eq_num, u8 *status) { return mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0, CMD_HW2SW_EQ, CMD_TIME_CLASS_A, status); } int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int cq_num, u8 *status) { return mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ, CMD_TIME_CLASS_A, status); } int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int cq_num, u8 *status) { return mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0, CMD_HW2SW_CQ, CMD_TIME_CLASS_A, status); } int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size, u8 *status) { struct mthca_mailbox *mailbox; __be32 *inbox; int err; #define RESIZE_CQ_IN_SIZE 0x40 #define RESIZE_CQ_LOG_SIZE_OFFSET 0x0c #define RESIZE_CQ_LKEY_OFFSET 0x1c mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; memset(inbox, 0, RESIZE_CQ_IN_SIZE); /* * Leave start address fields zeroed out -- mthca assumes that * MRs for CQs always start at virtual address 0. */ MTHCA_PUT(inbox, log_size, RESIZE_CQ_LOG_SIZE_OFFSET); MTHCA_PUT(inbox, lkey, RESIZE_CQ_LKEY_OFFSET); err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ, CMD_TIME_CLASS_B, status); mthca_free_mailbox(dev, mailbox); return err; } int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int srq_num, u8 *status) { return mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ, CMD_TIME_CLASS_A, status); } int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, int srq_num, u8 *status) { return mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0, CMD_HW2SW_SRQ, CMD_TIME_CLASS_A, status); } int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num, struct mthca_mailbox *mailbox, u8 *status) { return mthca_cmd_box(dev, 0, mailbox->dma, num, 0, CMD_QUERY_SRQ, CMD_TIME_CLASS_A, status); } int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit, u8 *status) { return mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ, CMD_TIME_CLASS_B, status); } int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur, enum ib_qp_state next, u32 num, int is_ee, struct mthca_mailbox *mailbox, u32 optmask, u8 *status) { static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = CMD_ERR2RST_QPEE, [IB_QPS_ERR] = CMD_2ERR_QPEE, [IB_QPS_INIT] = CMD_RST2INIT_QPEE, }, [IB_QPS_INIT] = { [IB_QPS_RESET] = CMD_ERR2RST_QPEE, [IB_QPS_ERR] = CMD_2ERR_QPEE, [IB_QPS_INIT] = CMD_INIT2INIT_QPEE, [IB_QPS_RTR] = CMD_INIT2RTR_QPEE, }, [IB_QPS_RTR] = { [IB_QPS_RESET] = CMD_ERR2RST_QPEE, [IB_QPS_ERR] = CMD_2ERR_QPEE, [IB_QPS_RTS] = CMD_RTR2RTS_QPEE, }, [IB_QPS_RTS] = { [IB_QPS_RESET] = CMD_ERR2RST_QPEE, [IB_QPS_ERR] = CMD_2ERR_QPEE, [IB_QPS_RTS] = CMD_RTS2RTS_QPEE, [IB_QPS_SQD] = CMD_RTS2SQD_QPEE, }, [IB_QPS_SQD] = { [IB_QPS_RESET] = CMD_ERR2RST_QPEE, [IB_QPS_ERR] = CMD_2ERR_QPEE, [IB_QPS_RTS] = CMD_SQD2RTS_QPEE, [IB_QPS_SQD] = CMD_SQD2SQD_QPEE, }, [IB_QPS_SQE] = { [IB_QPS_RESET] = CMD_ERR2RST_QPEE, [IB_QPS_ERR] = CMD_2ERR_QPEE, [IB_QPS_RTS] = CMD_SQERR2RTS_QPEE, }, [IB_QPS_ERR] = { [IB_QPS_RESET] = CMD_ERR2RST_QPEE, [IB_QPS_ERR] = 
CMD_2ERR_QPEE, } }; u8 op_mod = 0; int my_mailbox = 0; int err; if (op[cur][next] == CMD_ERR2RST_QPEE) { op_mod = 3; /* don't write outbox, any->reset */ /* For debugging */ if (!mailbox) { mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (!IS_ERR(mailbox)) { my_mailbox = 1; op_mod = 2; /* write outbox, any->reset */ } else mailbox = NULL; } err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, (!!is_ee << 24) | num, op_mod, op[cur][next], CMD_TIME_CLASS_C, status); if (0 && mailbox) { int i; mthca_dbg(dev, "Dumping QP context:\n"); printk(" %08x\n", be32_to_cpup(mailbox->buf)); for (i = 0; i < 0x100 / 4; ++i) { if (i % 8 == 0) printk("[%02x] ", i * 4); printk(" %08x", be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); if ((i + 1) % 8 == 0) printk("\n"); } } if (my_mailbox) mthca_free_mailbox(dev, mailbox); } else { if (0) { int i; mthca_dbg(dev, "Dumping QP context:\n"); printk(" opt param mask: %08x\n", be32_to_cpup(mailbox->buf)); for (i = 0; i < 0x100 / 4; ++i) { if (i % 8 == 0) printk(" [%02x] ", i * 4); printk(" %08x", be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); if ((i + 1) % 8 == 0) printk("\n"); } } err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num, op_mod, op[cur][next], CMD_TIME_CLASS_C, status); } return err; } int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee, struct mthca_mailbox *mailbox, u8 *status) { return mthca_cmd_box(dev, 0, mailbox->dma, (!!is_ee << 24) | num, 0, CMD_QUERY_QPEE, CMD_TIME_CLASS_A, status); } int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn, u8 *status) { u8 op_mod; switch (type) { case IB_QPT_SMI: op_mod = 0; break; case IB_QPT_GSI: op_mod = 1; break; case IB_QPT_RAW_IPV6: op_mod = 2; break; case IB_QPT_RAW_ETHERTYPE: op_mod = 3; break; default: return -EINVAL; } return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP, CMD_TIME_CLASS_B, status); } int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad, u8 *status) { struct mthca_mailbox *inmailbox, *outmailbox; void *inbox; int err; u32 in_modifier = port; u8 op_modifier = 0; #define MAD_IFC_BOX_SIZE 0x400 #define MAD_IFC_MY_QPN_OFFSET 0x100 #define MAD_IFC_RQPN_OFFSET 0x108 #define MAD_IFC_SL_OFFSET 0x10c #define MAD_IFC_G_PATH_OFFSET 0x10d #define MAD_IFC_RLID_OFFSET 0x10e #define MAD_IFC_PKEY_OFFSET 0x112 #define MAD_IFC_GRH_OFFSET 0x140 inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(inmailbox)) return PTR_ERR(inmailbox); inbox = inmailbox->buf; outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(outmailbox)) { mthca_free_mailbox(dev, inmailbox); return PTR_ERR(outmailbox); } memcpy(inbox, in_mad, 256); /* * Key check traps can't be generated unless we have in_wc to * tell us where to send the trap. */ if (ignore_mkey || !in_wc) op_modifier |= 0x1; if (ignore_bkey || !in_wc) op_modifier |= 0x2; if (in_wc) { u8 val; memset(inbox + 256, 0, 256); MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET); MTHCA_PUT(inbox, in_wc->src_qp, MAD_IFC_RQPN_OFFSET); val = in_wc->sl << 4; MTHCA_PUT(inbox, val, MAD_IFC_SL_OFFSET); val = in_wc->dlid_path_bits | (in_wc->wc_flags & IB_WC_GRH ? 
0x80 : 0); MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET); MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); if (in_grh) memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40); op_modifier |= 0x4; in_modifier |= in_wc->slid << 16; } err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, in_modifier, op_modifier, CMD_MAD_IFC, CMD_TIME_CLASS_C, status); if (!err && !*status) memcpy(response_mad, outmailbox->buf, 256); mthca_free_mailbox(dev, inmailbox); mthca_free_mailbox(dev, outmailbox); return err; } int mthca_READ_MGM(struct mthca_dev *dev, int index, struct mthca_mailbox *mailbox, u8 *status) { return mthca_cmd_box(dev, 0, mailbox->dma, index, 0, CMD_READ_MGM, CMD_TIME_CLASS_A, status); } int mthca_WRITE_MGM(struct mthca_dev *dev, int index, struct mthca_mailbox *mailbox, u8 *status) { return mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM, CMD_TIME_CLASS_A, status); } int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox, u16 *hash, u8 *status) { u64 imm; int err; err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH, CMD_TIME_CLASS_A, status); *hash = imm; return err; } int mthca_NOP(struct mthca_dev *dev, u8 *status) { return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100), status); } Index: head/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c (revision 300676) @@ -1,1361 +1,1363 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#define LINUXKPI_PARAM_PREFIX mthca_ + #include #include #include #include #include "mthca_dev.h" #include "mthca_config_reg.h" #include "mthca_cmd.h" #include "mthca_profile.h" #include "mthca_memfree.h" #include "mthca_wqe.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(mthca, 1); MODULE_DEPEND(mthca, linuxkpi, 1, 1, 1); MODULE_DEPEND(mthca, ibcore, 1, 1, 1); #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG int mthca_debug_level = 0; module_param_named(debug_level, mthca_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ #ifdef CONFIG_PCI_MSI static int msi_x = 1; module_param(msi_x, int, 0444); MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); #else /* CONFIG_PCI_MSI */ #define msi_x (0) #endif /* CONFIG_PCI_MSI */ static int tune_pci = 0; module_param(tune_pci, int, 0444); MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); DEFINE_MUTEX(mthca_device_mutex); #define MTHCA_DEFAULT_NUM_QP (1 << 16) #define MTHCA_DEFAULT_RDB_PER_QP (1 << 2) #define MTHCA_DEFAULT_NUM_CQ (1 << 16) #define MTHCA_DEFAULT_NUM_MCG (1 << 13) #define MTHCA_DEFAULT_NUM_MPT (1 << 17) #define MTHCA_DEFAULT_NUM_MTT (1 << 20) #define MTHCA_DEFAULT_NUM_UDAV (1 << 15) #define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18) #define MTHCA_DEFAULT_NUM_UARC_SIZE (1 << 18) static struct mthca_profile hca_profile = { .num_qp = MTHCA_DEFAULT_NUM_QP, .rdb_per_qp = MTHCA_DEFAULT_RDB_PER_QP, .num_cq = MTHCA_DEFAULT_NUM_CQ, .num_mcg = MTHCA_DEFAULT_NUM_MCG, .num_mpt = MTHCA_DEFAULT_NUM_MPT, .num_mtt = MTHCA_DEFAULT_NUM_MTT, .num_udav = MTHCA_DEFAULT_NUM_UDAV, /* Tavor only */ .fmr_reserved_mtts = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */ .uarc_size = MTHCA_DEFAULT_NUM_UARC_SIZE, /* Arbel only */ }; module_param_named(num_qp, hca_profile.num_qp, int, 0444); MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA"); module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444); MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP"); module_param_named(num_cq, hca_profile.num_cq, int, 0444); MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA"); module_param_named(num_mcg, hca_profile.num_mcg, int, 0444); MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA"); module_param_named(num_mpt, hca_profile.num_mpt, int, 0444); MODULE_PARM_DESC(num_mpt, "maximum number of memory protection table entries per HCA"); module_param_named(num_mtt, hca_profile.num_mtt, int, 0444); MODULE_PARM_DESC(num_mtt, "maximum number of memory translation table segments per HCA"); module_param_named(num_udav, hca_profile.num_udav, int, 0444); MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA"); module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); MODULE_PARM_DESC(fmr_reserved_mtts, "number of memory translation table segments reserved for FMR"); static int log_mtts_per_seg; module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); static char mthca_version[] __devinitdata = DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; static int mthca_tune_pci(struct mthca_dev *mdev) { if (!tune_pci) return 0; /* First try to max out Read Byte Count */ if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) { if (pcix_set_mmrbc(mdev->pdev, 
pcix_get_max_mmrbc(mdev->pdev))) { mthca_err(mdev, "Couldn't set PCI-X max read count, " "aborting.\n"); return -ENODEV; } } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) mthca_info(mdev, "No PCI-X capability, not setting RBC.\n"); if (pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP)) { if (pcie_set_readrq(mdev->pdev, 4096)) { mthca_err(mdev, "Couldn't write PCI Express read request, " "aborting.\n"); return -ENODEV; } } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE) mthca_info(mdev, "No PCI Express capability, " "not setting Max Read Request Size.\n"); return 0; } static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim) { int err; u8 status; mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8; err = mthca_QUERY_DEV_LIM(mdev, dev_lim, &status); if (err) { mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "QUERY_DEV_LIM returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } if (dev_lim->min_page_sz > PAGE_SIZE) { mthca_err(mdev, "HCA minimum page size of %d bigger than " "kernel PAGE_SIZE of %d, aborting.\n", dev_lim->min_page_sz, PAGE_SIZE); return -ENODEV; } if (dev_lim->num_ports > MTHCA_MAX_PORTS) { mthca_err(mdev, "HCA has %d ports, but we only support %d, " "aborting.\n", dev_lim->num_ports, MTHCA_MAX_PORTS); return -ENODEV; } if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) { mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than " "PCI resource 2 size of 0x%llx, aborting.\n", dev_lim->uar_size, (unsigned long long)pci_resource_len(mdev->pdev, 2)); return -ENODEV; } mdev->limits.num_ports = dev_lim->num_ports; mdev->limits.vl_cap = dev_lim->max_vl; mdev->limits.mtu_cap = dev_lim->max_mtu; mdev->limits.gid_table_len = dev_lim->max_gids; mdev->limits.pkey_table_len = dev_lim->max_pkeys; mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay; /* * Need to allow for worst case send WQE overhead and check * whether max_desc_sz imposes a lower limit than max_sg; UD * send has the biggest overhead. */ mdev->limits.max_sg = min_t(int, dev_lim->max_sg, (dev_lim->max_desc_sz - sizeof (struct mthca_next_seg) - (mthca_is_memfree(mdev) ? sizeof (struct mthca_arbel_ud_seg) : sizeof (struct mthca_tavor_ud_seg))) / sizeof (struct mthca_data_seg)); mdev->limits.max_wqes = dev_lim->max_qp_sz; mdev->limits.max_qp_init_rdma = dev_lim->max_requester_per_qp; mdev->limits.reserved_qps = dev_lim->reserved_qps; mdev->limits.max_srq_wqes = dev_lim->max_srq_sz; mdev->limits.reserved_srqs = dev_lim->reserved_srqs; mdev->limits.reserved_eecs = dev_lim->reserved_eecs; mdev->limits.max_desc_sz = dev_lim->max_desc_sz; mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev); /* * Subtract 1 from the limit because we need to allocate a * spare CQE so the HCA HW can tell the difference between an * empty CQ and a full CQ. 
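 * (This is the usual ring-buffer argument: with only producer and
 * consumer indices, a completely full queue and a completely empty
 * one look identical, so one entry is kept permanently unused to
 * tell the two apart.)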
*/ mdev->limits.max_cqes = dev_lim->max_cq_sz - 1; mdev->limits.reserved_cqs = dev_lim->reserved_cqs; mdev->limits.reserved_eqs = dev_lim->reserved_eqs; mdev->limits.reserved_mtts = dev_lim->reserved_mtts; mdev->limits.reserved_mrws = dev_lim->reserved_mrws; mdev->limits.reserved_uars = dev_lim->reserved_uars; mdev->limits.reserved_pds = dev_lim->reserved_pds; mdev->limits.port_width_cap = dev_lim->max_port_width; mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1); mdev->limits.flags = dev_lim->flags; /* * For old FW that doesn't return static rate support, use a * value of 0x3 (only static rate values of 0 or 1 are handled), * except on Sinai, where even old FW can handle static rate * values of 2 and 3. */ if (dev_lim->stat_rate_support) mdev->limits.stat_rate_support = dev_lim->stat_rate_support; else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) mdev->limits.stat_rate_support = 0xf; else mdev->limits.stat_rate_support = 0x3; /* IB_DEVICE_RESIZE_MAX_WR not supported by driver. May be doable since hardware supports it for SRQ. IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver. IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not supported by driver. */ mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR) mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR) mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI) mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI; if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG) mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE) mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev_lim->flags & DEV_LIM_FLAG_SRQ) mdev->mthca_flags |= MTHCA_FLAG_SRQ; if (mthca_is_memfree(mdev)) if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM) mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; return 0; } static int mthca_init_tavor(struct mthca_dev *mdev) { s64 size; u8 status; int err; struct mthca_dev_lim dev_lim; struct mthca_profile profile; struct mthca_init_hca_param init_hca; err = mthca_SYS_EN(mdev, &status); if (err) { mthca_err(mdev, "SYS_EN command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "SYS_EN returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } err = mthca_QUERY_FW(mdev, &status); if (err) { mthca_err(mdev, "QUERY_FW command failed, aborting.\n"); goto err_disable; } if (status) { mthca_err(mdev, "QUERY_FW returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_disable; } err = mthca_QUERY_DDR(mdev, &status); if (err) { mthca_err(mdev, "QUERY_DDR command failed, aborting.\n"); goto err_disable; } if (status) { mthca_err(mdev, "QUERY_DDR returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_disable; } err = mthca_dev_lim(mdev, &dev_lim); if (err) { mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n"); goto err_disable; } profile = hca_profile; profile.num_uar = dev_lim.uar_size / PAGE_SIZE; profile.uarc_size = 0; if (mdev->mthca_flags & MTHCA_FLAG_SRQ) profile.num_srq = dev_lim.max_srqs; size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); if (size < 0) { err = size; goto err_disable; } err = mthca_INIT_HCA(mdev, &init_hca, &status); if (err) { mthca_err(mdev, "INIT_HCA command failed, aborting.\n"); goto err_disable; } if (status) { mthca_err(mdev, "INIT_HCA 
returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_disable; } return 0; err_disable: mthca_SYS_DIS(mdev, &status); return err; } static int mthca_load_fw(struct mthca_dev *mdev) { u8 status; int err; /* FIXME: use HCA-attached memory for FW if present */ mdev->fw.arbel.fw_icm = mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.fw_icm) { mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); return -ENOMEM; } err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm, &status); if (err) { mthca_err(mdev, "MAP_FA command failed, aborting.\n"); goto err_free; } if (status) { mthca_err(mdev, "MAP_FA returned status 0x%02x, aborting.\n", status); err = -EINVAL; goto err_free; } err = mthca_RUN_FW(mdev, &status); if (err) { mthca_err(mdev, "RUN_FW command failed, aborting.\n"); goto err_unmap_fa; } if (status) { mthca_err(mdev, "RUN_FW returned status 0x%02x, aborting.\n", status); err = -EINVAL; goto err_unmap_fa; } return 0; err_unmap_fa: mthca_UNMAP_FA(mdev, &status); err_free: mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); return err; } static int mthca_init_icm(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim, struct mthca_init_hca_param *init_hca, u64 icm_size) { u64 aux_pages; u8 status; int err; err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages, &status); if (err) { mthca_err(mdev, "SET_ICM_SIZE command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "SET_ICM_SIZE returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n", (unsigned long long) icm_size >> 10, (unsigned long long) aux_pages << 2); mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.aux_icm) { mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); return -ENOMEM; } err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm, &status); if (err) { mthca_err(mdev, "MAP_ICM_AUX command failed, aborting.\n"); goto err_free_aux; } if (status) { mthca_err(mdev, "MAP_ICM_AUX returned status 0x%02x, aborting.\n", status); err = -EINVAL; goto err_free_aux; } err = mthca_map_eq_icm(mdev, init_hca->eqc_base); if (err) { mthca_err(mdev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_aux; } /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size, dma_get_cache_alignment()) / mdev->limits.mtt_seg_size; mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, mdev->limits.mtt_seg_size, mdev->limits.num_mtt_segs, mdev->limits.reserved_mtts, 1, 0); if (!mdev->mr_table.mtt_table) { mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_eq; } mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, dev_lim->mpt_entry_sz, mdev->limits.num_mpts, mdev->limits.reserved_mrws, 1, 1); if (!mdev->mr_table.mpt_table) { mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_mtt; } mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, dev_lim->qpc_entry_sz, mdev->limits.num_qps, mdev->limits.reserved_qps, 0, 0); if (!mdev->qp_table.qp_table) { mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_mpt; } mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base, dev_lim->eqpc_entry_sz, mdev->limits.num_qps, 
mdev->limits.reserved_qps, 0, 0); if (!mdev->qp_table.eqp_table) { mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_qp; } mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, MTHCA_RDB_ENTRY_SIZE, mdev->limits.num_qps << mdev->qp_table.rdb_shift, 0, 0, 0); if (!mdev->qp_table.rdb_table) { mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); err = -ENOMEM; goto err_unmap_eqp; } mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base, dev_lim->cqc_entry_sz, mdev->limits.num_cqs, mdev->limits.reserved_cqs, 0, 0); if (!mdev->cq_table.table) { mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_rdb; } if (mdev->mthca_flags & MTHCA_FLAG_SRQ) { mdev->srq_table.table = mthca_alloc_icm_table(mdev, init_hca->srqc_base, dev_lim->srq_entry_sz, mdev->limits.num_srqs, mdev->limits.reserved_srqs, 0, 0); if (!mdev->srq_table.table) { mthca_err(mdev, "Failed to map SRQ context memory, " "aborting.\n"); err = -ENOMEM; goto err_unmap_cq; } } /* * It's not strictly required, but for simplicity just map the * whole multicast group table now. The table isn't very big * and it's a lot easier than trying to track ref counts. */ mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base, MTHCA_MGM_ENTRY_SIZE, mdev->limits.num_mgms + mdev->limits.num_amgms, mdev->limits.num_mgms + mdev->limits.num_amgms, 0, 0); if (!mdev->mcg_table.table) { mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_srq; } return 0; err_unmap_srq: if (mdev->mthca_flags & MTHCA_FLAG_SRQ) mthca_free_icm_table(mdev, mdev->srq_table.table); err_unmap_cq: mthca_free_icm_table(mdev, mdev->cq_table.table); err_unmap_rdb: mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); err_unmap_eqp: mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); err_unmap_qp: mthca_free_icm_table(mdev, mdev->qp_table.qp_table); err_unmap_mpt: mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); err_unmap_mtt: mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); err_unmap_eq: mthca_unmap_eq_icm(mdev); err_unmap_aux: mthca_UNMAP_ICM_AUX(mdev, &status); err_free_aux: mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); return err; } static void mthca_free_icms(struct mthca_dev *mdev) { u8 status; mthca_free_icm_table(mdev, mdev->mcg_table.table); if (mdev->mthca_flags & MTHCA_FLAG_SRQ) mthca_free_icm_table(mdev, mdev->srq_table.table); mthca_free_icm_table(mdev, mdev->cq_table.table); mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); mthca_free_icm_table(mdev, mdev->qp_table.qp_table); mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); mthca_unmap_eq_icm(mdev); mthca_UNMAP_ICM_AUX(mdev, &status); mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); } static int mthca_init_arbel(struct mthca_dev *mdev) { struct mthca_dev_lim dev_lim; struct mthca_profile profile; struct mthca_init_hca_param init_hca; s64 icm_size; u8 status; int err; err = mthca_QUERY_FW(mdev, &status); if (err) { mthca_err(mdev, "QUERY_FW command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "QUERY_FW returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } err = mthca_ENABLE_LAM(mdev, &status); if (err) { mthca_err(mdev, "ENABLE_LAM command failed, aborting.\n"); return err; } if (status == MTHCA_CMD_STAT_LAM_NOT_PRE) { mthca_dbg(mdev, "No HCA-attached memory (running in 
MemFree mode)\n"); mdev->mthca_flags |= MTHCA_FLAG_NO_LAM; } else if (status) { mthca_err(mdev, "ENABLE_LAM returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } err = mthca_load_fw(mdev); if (err) { mthca_err(mdev, "Failed to start FW, aborting.\n"); goto err_disable; } err = mthca_dev_lim(mdev, &dev_lim); if (err) { mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n"); goto err_stop_fw; } profile = hca_profile; profile.num_uar = dev_lim.uar_size / PAGE_SIZE; profile.num_udav = 0; if (mdev->mthca_flags & MTHCA_FLAG_SRQ) profile.num_srq = dev_lim.max_srqs; icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); if (icm_size < 0) { err = icm_size; goto err_stop_fw; } err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size); if (err) goto err_stop_fw; err = mthca_INIT_HCA(mdev, &init_hca, &status); if (err) { mthca_err(mdev, "INIT_HCA command failed, aborting.\n"); goto err_free_icm; } if (status) { mthca_err(mdev, "INIT_HCA returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_free_icm; } return 0; err_free_icm: mthca_free_icms(mdev); err_stop_fw: mthca_UNMAP_FA(mdev, &status); mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); err_disable: if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) mthca_DISABLE_LAM(mdev, &status); return err; } static void mthca_close_hca(struct mthca_dev *mdev) { u8 status; mthca_CLOSE_HCA(mdev, 0, &status); if (mthca_is_memfree(mdev)) { mthca_free_icms(mdev); mthca_UNMAP_FA(mdev, &status); mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) mthca_DISABLE_LAM(mdev, &status); } else mthca_SYS_DIS(mdev, &status); } static int mthca_init_hca(struct mthca_dev *mdev) { u8 status; int err; struct mthca_adapter adapter; if (mthca_is_memfree(mdev)) err = mthca_init_arbel(mdev); else err = mthca_init_tavor(mdev); if (err) return err; err = mthca_QUERY_ADAPTER(mdev, &adapter, &status); if (err) { mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n"); goto err_close; } if (status) { mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_close; } mdev->eq_table.inta_pin = adapter.inta_pin; if (!mthca_is_memfree(mdev)) mdev->rev_id = adapter.revision_id; memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id); return 0; err_close: mthca_close_hca(mdev); return err; } static int mthca_setup_hca(struct mthca_dev *dev) { int err; u8 status; MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock); err = mthca_init_uar_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "user access region table, aborting.\n"); return err; } err = mthca_uar_alloc(dev, &dev->driver_uar); if (err) { mthca_err(dev, "Failed to allocate driver access region, " "aborting.\n"); goto err_uar_table_free; } dev->kar = ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!dev->kar) { mthca_err(dev, "Couldn't map kernel access region, " "aborting.\n"); err = -ENOMEM; goto err_uar_free; } err = mthca_init_pd_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "protection domain table, aborting.\n"); goto err_kar_unmap; } err = mthca_init_mr_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "memory region table, aborting.\n"); goto err_pd_table_free; } err = mthca_pd_alloc(dev, 1, &dev->driver_pd); if (err) { mthca_err(dev, "Failed to create driver PD, " "aborting.\n"); goto err_mr_table_free; } err = mthca_init_eq_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "event queue table, aborting.\n"); goto err_pd_free; } 
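/*
 * Up to this point firmware commands complete by polling.  Now that the
 * EQ table exists, the driver switches to event-driven (interrupt based)
 * command completions, and the NOP command below verifies that the
 * command interrupt is actually delivered before anything relies on it.
 */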
err = mthca_cmd_use_events(dev); if (err) { mthca_err(dev, "Failed to switch to event-driven " "firmware commands, aborting.\n"); goto err_eq_table_free; } err = mthca_NOP(dev, &status); if (err || status) { if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { mthca_warn(dev, "NOP command failed to generate interrupt " "(IRQ %d).\n", dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector); mthca_warn(dev, "Trying again with MSI-X disabled.\n"); } else { mthca_err(dev, "NOP command failed to generate interrupt " "(IRQ %d), aborting.\n", dev->pdev->irq); mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n"); } goto err_cmd_poll; } mthca_dbg(dev, "NOP command IRQ test passed\n"); err = mthca_init_cq_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "completion queue table, aborting.\n"); goto err_cmd_poll; } err = mthca_init_srq_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "shared receive queue table, aborting.\n"); goto err_cq_table_free; } err = mthca_init_qp_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "queue pair table, aborting.\n"); goto err_srq_table_free; } err = mthca_init_av_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "address vector table, aborting.\n"); goto err_qp_table_free; } err = mthca_init_mcg_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "multicast group table, aborting.\n"); goto err_av_table_free; } return 0; err_av_table_free: mthca_cleanup_av_table(dev); err_qp_table_free: mthca_cleanup_qp_table(dev); err_srq_table_free: mthca_cleanup_srq_table(dev); err_cq_table_free: mthca_cleanup_cq_table(dev); err_cmd_poll: mthca_cmd_use_polling(dev); err_eq_table_free: mthca_cleanup_eq_table(dev); err_pd_free: mthca_pd_free(dev, &dev->driver_pd); err_mr_table_free: mthca_cleanup_mr_table(dev); err_pd_table_free: mthca_cleanup_pd_table(dev); err_kar_unmap: iounmap(dev->kar); err_uar_free: mthca_uar_free(dev, &dev->driver_uar); err_uar_table_free: mthca_cleanup_uar_table(dev); return err; } static int mthca_enable_msi_x(struct mthca_dev *mdev) { struct msix_entry entries[3]; int err; entries[0].entry = 0; entries[1].entry = 1; entries[2].entry = 2; err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries)); if (err) { if (err > 0) mthca_info(mdev, "Only %d MSI-X vectors available, " "not using MSI-X\n", err); return err; } mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector; mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector; mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = entries[2].vector; return 0; } /* Types of supported HCA */ enum { TAVOR, /* MT23108 */ ARBEL_COMPAT, /* MT25208 in Tavor compat mode */ ARBEL_NATIVE, /* MT25208 with extended features */ SINAI /* MT25204 */ }; #define MTHCA_FW_VER(major, minor, subminor) \ (((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor)) static struct { u64 latest_fw; u32 flags; } mthca_hca_table[] = { [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0), .flags = 0 }, [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200), .flags = MTHCA_FLAG_PCIE }, [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0), .flags = MTHCA_FLAG_MEMFREE | MTHCA_FLAG_PCIE }, [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0), .flags = MTHCA_FLAG_MEMFREE | MTHCA_FLAG_PCIE | MTHCA_FLAG_SINAI_OPT } }; static int __mthca_init_one(struct pci_dev *pdev, int hca_type) { int ddr_hidden = 0; int err; struct mthca_dev *mdev; printk(KERN_INFO PFX "Initializing %s\n", pci_name(pdev)); err = pci_enable_device(pdev); if (err) { dev_err(&pdev->dev, "Cannot enable PCI device, 
" "aborting.\n"); return err; } /* * Check for BARs. We expect 0: 1MB, 2: 8MB, 4: DDR (may not * be present) */ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || pci_resource_len(pdev, 0) != 1 << 20) { dev_err(&pdev->dev, "Missing DCS, aborting.\n"); err = -ENODEV; goto err_disable_pdev; } if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { dev_err(&pdev->dev, "Missing UAR, aborting.\n"); err = -ENODEV; goto err_disable_pdev; } if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM)) ddr_hidden = 1; err = pci_request_regions(pdev, DRV_NAME); if (err) { dev_err(&pdev->dev, "Cannot obtain PCI resources, " "aborting.\n"); goto err_disable_pdev; } pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); goto err_free_res; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " "consistent PCI DMA mask.\n"); err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); goto err_free_res; } } mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev); if (!mdev) { dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; goto err_free_res; } mdev->pdev = pdev; mdev->mthca_flags = mthca_hca_table[hca_type].flags; if (ddr_hidden) mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left * the HCA in an undefined state. */ err = mthca_reset(mdev); if (err) { mthca_err(mdev, "Failed to reset HCA, aborting.\n"); goto err_free_dev; } if (mthca_cmd_init(mdev)) { mthca_err(mdev, "Failed to init command interface, aborting.\n"); goto err_free_dev; } err = mthca_tune_pci(mdev); if (err) goto err_cmd; err = mthca_init_hca(mdev); if (err) goto err_cmd; if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n", (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, (int) (mdev->fw_ver & 0xffff), (int) (mthca_hca_table[hca_type].latest_fw >> 32), (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); } if (msi_x && !mthca_enable_msi_x(mdev)) mdev->mthca_flags |= MTHCA_FLAG_MSI_X; err = mthca_setup_hca(mdev); if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) { if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) pci_disable_msix(pdev); mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X; err = mthca_setup_hca(mdev); } if (err) goto err_close; err = mthca_register_device(mdev); if (err) goto err_cleanup; err = mthca_create_agents(mdev); if (err) goto err_unregister; pci_set_drvdata(pdev, mdev); mdev->hca_type = hca_type; mdev->active = 1; return 0; err_unregister: mthca_unregister_device(mdev); err_cleanup: mthca_cleanup_mcg_table(mdev); mthca_cleanup_av_table(mdev); mthca_cleanup_qp_table(mdev); mthca_cleanup_srq_table(mdev); mthca_cleanup_cq_table(mdev); mthca_cmd_use_polling(mdev); mthca_cleanup_eq_table(mdev); mthca_pd_free(mdev, &mdev->driver_pd); mthca_cleanup_mr_table(mdev); mthca_cleanup_pd_table(mdev); mthca_cleanup_uar_table(mdev); err_close: if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) 
pci_disable_msix(pdev); mthca_close_hca(mdev); err_cmd: mthca_cmd_cleanup(mdev); err_free_dev: ib_dealloc_device(&mdev->ib_dev); err_free_res: pci_release_regions(pdev); err_disable_pdev: pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); return err; } static void __mthca_remove_one(struct pci_dev *pdev) { struct mthca_dev *mdev = pci_get_drvdata(pdev); u8 status; int p; if (mdev) { mthca_free_agents(mdev); mthca_unregister_device(mdev); for (p = 1; p <= mdev->limits.num_ports; ++p) mthca_CLOSE_IB(mdev, p, &status); mthca_cleanup_mcg_table(mdev); mthca_cleanup_av_table(mdev); mthca_cleanup_qp_table(mdev); mthca_cleanup_srq_table(mdev); mthca_cleanup_cq_table(mdev); mthca_cmd_use_polling(mdev); mthca_cleanup_eq_table(mdev); mthca_pd_free(mdev, &mdev->driver_pd); mthca_cleanup_mr_table(mdev); mthca_cleanup_pd_table(mdev); iounmap(mdev->kar); mthca_uar_free(mdev, &mdev->driver_uar); mthca_cleanup_uar_table(mdev); mthca_close_hca(mdev); mthca_cmd_cleanup(mdev); if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) pci_disable_msix(pdev); ib_dealloc_device(&mdev->ib_dev); pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } } int __mthca_restart_one(struct pci_dev *pdev) { struct mthca_dev *mdev; int hca_type; mdev = pci_get_drvdata(pdev); if (!mdev) return -ENODEV; hca_type = mdev->hca_type; __mthca_remove_one(pdev); return __mthca_init_one(pdev, hca_type); } static int __devinit mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { static int mthca_version_printed = 0; int ret; mutex_lock(&mthca_device_mutex); if (!mthca_version_printed) { printk(KERN_INFO "%s", mthca_version); ++mthca_version_printed; } if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { printk(KERN_ERR PFX "%s has invalid driver data %jx\n", pci_name(pdev), (uintmax_t)id->driver_data); mutex_unlock(&mthca_device_mutex); return -ENODEV; } ret = __mthca_init_one(pdev, id->driver_data); mutex_unlock(&mthca_device_mutex); return ret; } static void __devexit mthca_remove_one(struct pci_dev *pdev) { mutex_lock(&mthca_device_mutex); __mthca_remove_one(pdev); mutex_unlock(&mthca_device_mutex); } static struct pci_device_id mthca_pci_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), .driver_data = TAVOR }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR), .driver_data = TAVOR }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), .driver_data = ARBEL_COMPAT }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), .driver_data = ARBEL_COMPAT }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL), .driver_data = ARBEL_NATIVE }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL), .driver_data = ARBEL_NATIVE }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI), .driver_data = SINAI }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI), .driver_data = SINAI }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), .driver_data = SINAI }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), .driver_data = SINAI }, { 0, } }; MODULE_DEVICE_TABLE(pci, mthca_pci_table); static struct pci_driver mthca_driver = { .name = DRV_NAME, .id_table = mthca_pci_table, .probe = mthca_init_one, .remove = __devexit_p(mthca_remove_one) }; static void __init __mthca_check_profile_val(const char *name, int *pval, int pval_default) { /* value must be positive and power of 2 */ int old_pval = *pval; if (old_pval <= 0) *pval = 
pval_default; else *pval = roundup_pow_of_two(old_pval); if (old_pval != *pval) { printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n", old_pval, name); printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval); } } #define mthca_check_profile_val(name, default) \ __mthca_check_profile_val(#name, &hca_profile.name, default) static void __init mthca_validate_profile(void) { mthca_check_profile_val(num_qp, MTHCA_DEFAULT_NUM_QP); mthca_check_profile_val(rdb_per_qp, MTHCA_DEFAULT_RDB_PER_QP); mthca_check_profile_val(num_cq, MTHCA_DEFAULT_NUM_CQ); mthca_check_profile_val(num_mcg, MTHCA_DEFAULT_NUM_MCG); mthca_check_profile_val(num_mpt, MTHCA_DEFAULT_NUM_MPT); mthca_check_profile_val(num_mtt, MTHCA_DEFAULT_NUM_MTT); mthca_check_profile_val(num_udav, MTHCA_DEFAULT_NUM_UDAV); mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS); if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) { printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n", hca_profile.fmr_reserved_mtts); printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n", hca_profile.num_mtt); hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2; printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n", hca_profile.fmr_reserved_mtts); } if (log_mtts_per_seg == 0) log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n", log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); } } static int __init mthca_init(void) { int ret; mthca_validate_profile(); ret = mthca_catas_init(); if (ret) return ret; ret = pci_register_driver(&mthca_driver); if (ret < 0) { mthca_catas_cleanup(); return ret; } return 0; } static void __exit mthca_cleanup(void) { pci_unregister_driver(&mthca_driver); mthca_catas_cleanup(); } module_init_order(mthca_init, SI_ORDER_MIDDLE); module_exit(mthca_cleanup); Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (revision 300675) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (revision 300676) @@ -1,761 +1,763 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _IPOIB_H #define _IPOIB_H +#define LINUXKPI_PARAM_PREFIX ipoib_ + #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ofed.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include /* constants */ #define INFINIBAND_ALEN 20 /* Octets in IPoIB HW addr */ #ifdef IPOIB_CM #define CONFIG_INFINIBAND_IPOIB_CM #endif #ifdef IPOIB_DEBUG #define CONFIG_INFINIBAND_IPOIB_DEBUG #define CONFIG_INFINIBAND_IPOIB_DEBUG_DATA #endif enum ipoib_flush_level { IPOIB_FLUSH_LIGHT, IPOIB_FLUSH_NORMAL, IPOIB_FLUSH_HEAVY }; enum { IPOIB_ENCAP_LEN = 4, IPOIB_HEADER_LEN = IPOIB_ENCAP_LEN + INFINIBAND_ALEN, IPOIB_UD_MAX_MTU = 4 * 1024, // IPOIB_UD_RX_SG = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE), IPOIB_UD_RX_SG = 2, IPOIB_UD_TX_SG = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2, IPOIB_CM_MAX_MTU = (64 * 1024), IPOIB_CM_TX_SG = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2, IPOIB_CM_RX_SG = (IPOIB_CM_MAX_MTU / MJUMPAGESIZE), IPOIB_RX_RING_SIZE = 256, IPOIB_TX_RING_SIZE = 128, IPOIB_MAX_RX_SG = MAX(IPOIB_CM_RX_SG, IPOIB_UD_RX_SG), IPOIB_MAX_TX_SG = MAX(IPOIB_CM_TX_SG, IPOIB_UD_TX_SG), IPOIB_MAX_QUEUE_SIZE = 8192, IPOIB_MIN_QUEUE_SIZE = 2, IPOIB_CM_MAX_CONN_QP = 4096, IPOIB_NUM_WC = 4, IPOIB_MAX_PATH_REC_QUEUE = 3, IPOIB_MAX_MCAST_QUEUE = 3, IPOIB_FLAG_OPER_UP = 0, IPOIB_FLAG_INITIALIZED = 1, IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, IPOIB_PKEY_STOP = 4, IPOIB_FLAG_SUBINTERFACE = 5, IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_FLAG_UMCAST = 10, IPOIB_FLAG_CSUM = 11, IPOIB_MAX_BACKOFF_SECONDS = 16, IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, IPOIB_MAX_LRO_DESCRIPTORS = 8, IPOIB_LRO_MAX_AGGR = 64, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, }; #define IPOIB_OP_RECV (1ul << 31) #ifdef CONFIG_INFINIBAND_IPOIB_CM #define IPOIB_OP_CM (1ul << 30) #else #define IPOIB_OP_CM (0) #endif /* structs */ struct ipoib_header { u8 hwaddr[INFINIBAND_ALEN]; __be16 proto; u16 reserved; }; struct ipoib_pseudoheader { u8 hwaddr[INFINIBAND_ALEN]; }; /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ struct ipoib_mcast { struct ib_sa_mcmember_rec mcmember; struct ib_sa_multicast *mc; struct ipoib_ah *ah; struct rb_node rb_node; struct list_head list; unsigned long created; unsigned long backoff; unsigned long flags; unsigned char logcount; struct ifqueue pkt_queue; struct ipoib_dev_priv *priv; }; struct ipoib_cm_rx_buf { struct mbuf *mb; u64 mapping[IPOIB_CM_RX_SG]; }; struct ipoib_cm_tx_buf { struct mbuf *mb; u64 mapping[IPOIB_CM_TX_SG]; }; struct ipoib_rx_buf { struct mbuf *mb; u64 mapping[IPOIB_UD_RX_SG]; }; struct ipoib_tx_buf { struct mbuf *mb; u64 mapping[IPOIB_UD_TX_SG]; }; struct ib_cm_id; struct ipoib_cm_data { __be32 qpn; /* High byte MUST be ignored on receive */ __be32 mtu; }; /* * Quoting 10.3.1 Queue Pair and EE Context States: * * Note, for QPs that are associated with an 
SRQ, the Consumer should take the * QP through the Error State before invoking a Destroy QP or a Modify QP to the * Reset State. The Consumer may invoke the Destroy QP without first performing * a Modify QP to the Error State and waiting for the Affiliated Asynchronous * Last WQE Reached Event. However, if the Consumer does not wait for the * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment * leakage may occur. Therefore, it is good programming practice to tear down a * QP that is associated with an SRQ by using the following process: * * - Put the QP in the Error State * - Wait for the Affiliated Asynchronous Last WQE Reached Event; * - either: * drain the CQ by invoking the Poll CQ verb and either wait for CQ * to be empty or the number of Poll CQ operations has exceeded * CQ capacity size; * - or * post another WR that completes on the same CQ and wait for this * WR to return as a WC; * - and then invoke a Destroy QP or Reset QP. * * We use the second option and wait for a completion on the * same CQ before destroying QPs attached to our SRQ. */ enum ipoib_cm_state { IPOIB_CM_RX_LIVE, IPOIB_CM_RX_ERROR, /* Ignored by stale task */ IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ }; struct ipoib_cm_rx { struct ib_cm_id *id; struct ib_qp *qp; struct ipoib_cm_rx_buf *rx_ring; struct list_head list; struct ipoib_dev_priv *priv; unsigned long jiffies; enum ipoib_cm_state state; int recv_count; }; struct ipoib_cm_tx { struct ib_cm_id *id; struct ib_qp *qp; struct list_head list; struct ipoib_dev_priv *priv; struct ipoib_path *path; struct ipoib_cm_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; unsigned long flags; u32 mtu; /* remote specified mtu, with grh. */ }; struct ipoib_cm_dev_priv { struct ib_srq *srq; struct ipoib_cm_rx_buf *srq_ring; struct ib_cm_id *id; struct list_head passive_ids; /* state: LIVE */ struct list_head rx_error_list; /* state: ERROR */ struct list_head rx_flush_list; /* state: FLUSH, drain not started */ struct list_head rx_drain_list; /* state: FLUSH, drain started */ struct list_head rx_reap_list; /* state: FLUSH, drain done */ struct work_struct start_task; struct work_struct reap_task; struct work_struct mb_task; struct work_struct rx_reap_task; struct delayed_work stale_task; struct ifqueue mb_queue; struct list_head start_list; struct list_head reap_list; struct ib_sge rx_sge[IPOIB_CM_RX_SG]; struct ib_recv_wr rx_wr; int nonsrq_conn_qp; int max_cm_mtu; /* Actual buf size. */ int num_frags; }; struct ipoib_ethtool_st { u16 coalesce_usecs; u16 max_coalesced_frames; }; /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. lock nests inside * of tx_lock (ie tx_lock must be acquired first if needed). 
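*
* For example, a hypothetical transmit handler (not a function in this
* driver) would nest the locks in this order:
*
*	example_xmit(priv):
*		acquire tx_lock         - network stack TX lock first
*		acquire priv->lock      - driver lock nests inside
*		... touch TX-path and shared state ...
*		release priv->lock
*		release tx_lock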
*/ struct ipoib_dev_priv { spinlock_t lock; spinlock_t drain_lock; struct ifnet *dev; u8 broadcastaddr[INFINIBAND_ALEN]; unsigned long flags; int gone; struct mutex vlan_mutex; struct rb_root path_tree; struct list_head path_list; struct ipoib_mcast *broadcast; struct list_head multicast_list; struct rb_root multicast_tree; struct delayed_work pkey_poll_task; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; struct work_struct flush_normal; struct work_struct flush_heavy; struct work_struct restart_task; struct delayed_work ah_reap_task; struct ib_device *ca; u8 port; u16 pkey; u16 pkey_index; struct ib_pd *pd; struct ib_mr *mr; struct ib_cq *recv_cq; struct ib_cq *send_cq; struct ib_qp *qp; u32 qkey; union ib_gid local_gid; u16 local_lid; unsigned int admin_mtu; /* User selected MTU, no GRH. */ unsigned int mcast_mtu; /* Minus GRH bytes, from mcast group. */ unsigned int max_ib_mtu; /* Without header, actual buf size. */ struct ipoib_rx_buf *rx_ring; struct ipoib_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; struct ib_sge tx_sge[IPOIB_MAX_TX_SG]; struct ib_send_wr tx_wr; unsigned tx_outstanding; struct ib_wc send_wc[MAX_SEND_CQE]; struct ib_recv_wr rx_wr; struct ib_sge rx_sge[IPOIB_MAX_RX_SG]; struct ib_wc ibwc[IPOIB_NUM_WC]; struct list_head dead_ahs; struct ib_event_handler event_handler; struct ifnet *parent; struct list_head child_intfs; struct list_head list; #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_dev_priv cm; #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct list_head fs_list; struct dentry *mcg_dentry; struct dentry *path_dentry; #endif int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; }; struct ipoib_ah { struct ipoib_dev_priv *priv; struct ib_ah *ah; struct list_head list; struct kref ref; unsigned last_send; }; struct ipoib_path { struct ipoib_dev_priv *priv; struct rb_node rb_node; struct list_head list; #ifdef CONFIG_INFINIBAND_IPOIB_CM uint8_t hwaddr[INFINIBAND_ALEN]; struct ipoib_cm_tx *cm; #endif struct ipoib_ah *ah; struct ib_sa_path_rec pathrec; struct ifqueue queue; int query_id; struct ib_sa_query *query; struct completion done; int valid; }; /* UD Only transmits encap len but we want the two sizes to be symmetrical. 
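*
* For example, with a 2048-byte IB MTU the macros below yield a UD
* payload MTU of 2048 - IPOIB_ENCAP_LEN = 2048 - 4 = 2044 bytes and a
* CM payload MTU of 2048 - 0x10 = 2032 bytes.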
*/ #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) #define IPOIB_CM_MTU(ib_mtu) (ib_mtu - 0x10) #define IPOIB_IS_MULTICAST(addr) ((addr)[4] == 0xff) extern struct workqueue_struct *ipoib_workqueue; #define IPOIB_MTAP_PROTO(_ifp, _m, _proto) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_proto((_ifp), (_m), (_proto)); \ } \ } while (0) /* functions */ void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *, struct ib_pd *pd, struct ib_ah_attr *attr); void ipoib_free_ah(struct kref *kref); static inline void ipoib_put_ah(struct ipoib_ah *ah) { kref_put(&ah->ref, ipoib_free_ah); } int ipoib_open(struct ipoib_dev_priv *priv); int ipoib_add_pkey_attr(struct ipoib_dev_priv *priv); int ipoib_add_umcast_attr(struct ipoib_dev_priv *priv); void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto); void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn); void ipoib_reap_ah(struct work_struct *work); void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv); void ipoib_flush_paths(struct ipoib_dev_priv *priv); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv); int ipoib_ib_dev_open(struct ipoib_dev_priv *priv); int ipoib_ib_dev_up(struct ipoib_dev_priv *priv); int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush); int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush); int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb); void ipoib_mcast_restart_task(struct work_struct *work); void ipoib_mcast_restart(struct ipoib_dev_priv *); int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv); int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush); void ipoib_mcast_dev_down(struct ipoib_dev_priv *priv); void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv); void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv); int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *gid, unsigned long *created, unsigned int *queuelen, unsigned int *complete, unsigned int *send_only); struct ipoib_path_iter *ipoib_path_iter_init(struct ipoib_dev_priv *priv); int ipoib_path_iter_next(struct ipoib_path_iter *iter); void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path); #endif int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu); int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey); int ipoib_init_qp(struct ipoib_dev_priv *priv); int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct 
ib_device *ca); void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_event(struct ib_event_handler *handler, struct ib_event *record); void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct ipoib_dev_priv *priv); int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max); void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); int ipoib_poll_tx(struct ipoib_dev_priv *priv); void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req); void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length); struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size); void ipoib_set_ethtool_ops(struct ifnet *dev); int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); #ifdef CONFIG_INFINIBAND_IPOIB_CM #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 0x40 /* We don't support UC connections at the moment */ #define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) extern int ipoib_max_conn_qp; static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { return IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)); } static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { return IPOIB_CM_SUPPORTED(hwaddr); } static inline int ipoib_cm_up(struct ipoib_path *path) { return test_bit(IPOIB_FLAG_OPER_UP, &path->cm->flags); } static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { return path->cm; } static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { path->cm = tx; } static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { return !!priv->cm.srq; } static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { return priv->cm.max_cm_mtu; } void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx); int ipoib_cm_dev_open(struct ipoib_dev_priv *priv); void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv); int ipoib_cm_dev_init(struct ipoib_dev_priv *priv); int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv); void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv); struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path); void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu); void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); #else struct ipoib_cm_tx; #define ipoib_max_conn_qp 0 static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { return 0; } static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { return 0; } static inline int ipoib_cm_up(struct ipoib_path *path) { return 0; } static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { return NULL; } static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { } static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { return 0; } static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx) { return; } static inline int ipoib_cm_dev_open(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv) { return; } 
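/*
 * Because these no-op stubs mirror the signatures of the real CM
 * functions above, callers can be written without #ifdefs.  A minimal
 * sketch (example_start_xmit is hypothetical, not part of this header,
 * and the real transmit path also handles path lookup and queueing):
 *
 *	static void
 *	example_start_xmit(struct ipoib_dev_priv *priv, struct mbuf *mb,
 *	    struct ipoib_path *path, u32 qpn)
 *	{
 *		if (ipoib_cm_get(path) != NULL && ipoib_cm_up(path))
 *			ipoib_cm_send(priv, mb, ipoib_cm_get(path));
 *		else
 *			ipoib_send(priv, mb, path->ah, qpn);
 *	}
 */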
static inline int ipoib_cm_dev_init(struct ipoib_dev_priv *priv) { return -ENOSYS; } static inline void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv) { return; } static inline struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path) { return NULL; } static inline void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { return; } static inline int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu) { m_freem(mb); } static inline void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } static inline void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG void ipoib_create_debug_files(struct ipoib_dev_priv *priv); void ipoib_delete_debug_files(struct ipoib_dev_priv *priv); int ipoib_register_debugfs(void); void ipoib_unregister_debugfs(void); #else static inline void ipoib_create_debug_files(struct ipoib_dev_priv *priv) { } static inline void ipoib_delete_debug_files(struct ipoib_dev_priv *priv) { } static inline int ipoib_register_debugfs(void) { return 0; } static inline void ipoib_unregister_debugfs(void) { } #endif #define ipoib_printk(level, priv, format, arg...) \ printk(level "%s: " format, if_name(((struct ipoib_dev_priv *) priv)->dev), ## arg) #define ipoib_warn(priv, format, arg...) \ ipoib_printk(KERN_WARNING, priv, format , ## arg) extern int ipoib_sendq_size; extern int ipoib_recvq_size; extern struct ib_sa_client ipoib_sa_client; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG extern int ipoib_debug_level; #define ipoib_dbg(priv, format, arg...) \ do { \ if (ipoib_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #define ipoib_dbg_mcast(priv, format, arg...) \ do { \ if (mcast_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ #define ipoib_dbg(priv, format, arg...) \ do { (void) (priv); } while (0) #define ipoib_dbg_mcast(priv, format, arg...) \ do { (void) (priv); } while (0) #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA #define ipoib_dbg_data(priv, format, arg...) \ do { \ if (data_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ #define ipoib_dbg_data(priv, format, arg...) 
\ do { (void) (priv); } while (0) #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) #endif /* _IPOIB_H */ Index: head/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h (revision 300675) +++ head/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h (revision 300676) @@ -1,723 +1,725 @@ #ifndef _SDP_H_ #define _SDP_H_ +#define LINUXKPI_PARAM_PREFIX ib_sdp_ + #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ofed.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SDP_DEBUG #define CONFIG_INFINIBAND_SDP_DEBUG #endif #include "sdp_dbg.h" #undef LIST_HEAD /* From sys/queue.h */ #define LIST_HEAD(name, type) \ struct name { \ struct type *lh_first; /* first element */ \ } /* Interval between successive polls in the Tx routine when polling is used instead of interrupts (in per-core Tx rings) - should be power of 2 */ #define SDP_TX_POLL_MODER 16 #define SDP_TX_POLL_TIMEOUT (HZ / 20) #define SDP_NAGLE_TIMEOUT (HZ / 10) #define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 5) #define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ) #define SDP_SRCAVAIL_PAYLOAD_LEN 1 #define SDP_RESOLVE_TIMEOUT 1000 #define SDP_ROUTE_TIMEOUT 1000 #define SDP_RETRY_COUNT 5 #define SDP_KEEPALIVE_TIME (120 * 60 * HZ) #define SDP_FIN_WAIT_TIMEOUT (60 * HZ) /* like TCP_FIN_TIMEOUT */ #define SDP_TX_SIZE 0x40 #define SDP_RX_SIZE 0x40 #define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64)) #define SDP_FMR_POOL_SIZE 1024 #define SDP_FMR_DIRTY_SIZE ( SDP_FMR_POOL_SIZE / 4 ) #define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2)) /* mb inlined data len - rest will be rx'ed into frags */ #define SDP_HEAD_SIZE (sizeof(struct sdp_bsdh)) /* limit tx payload len, if the sink supports bigger buffers than the source * can handle. * or rx fragment size (limited by sge->length size) */ #define SDP_MAX_PACKET (1 << 16) #define SDP_MAX_PAYLOAD (SDP_MAX_PACKET - SDP_HEAD_SIZE) #define SDP_MAX_RECV_SGES (SDP_MAX_PACKET / MCLBYTES) #define SDP_MAX_SEND_SGES (SDP_MAX_PACKET / MCLBYTES) + 2 #define SDP_NUM_WC 4 #define SDP_DEF_ZCOPY_THRESH 64*1024 #define SDP_MIN_ZCOPY_THRESH PAGE_SIZE #define SDP_MAX_ZCOPY_THRESH 1048576 #define SDP_OP_RECV 0x800000000LL #define SDP_OP_SEND 0x400000000LL #define SDP_OP_RDMA 0x200000000LL #define SDP_OP_NOP 0x100000000LL /* how long (in jiffies) to block sender till tx completion*/ #define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10) #define SDP_AUTO_CONF 0xffff #define AUTO_MOD_DELAY (HZ / 4) struct sdp_mb_cb { __u32 seq; /* Starting sequence number */ struct bzcopy_state *bz; struct rx_srcavail_state *rx_sa; struct tx_srcavail_state *tx_sa; }; #define M_PUSH M_PROTO1 /* Do a 'push'. */ #define M_URG M_PROTO2 /* Mark as urgent (oob). */ #define SDP_SKB_CB(__mb) ((struct sdp_mb_cb *)&((__mb)->cb[0])) #define BZCOPY_STATE(mb) (SDP_SKB_CB(mb)->bz) #define RX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->rx_sa) #define TX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->tx_sa) #ifndef MIN #define MIN(a, b) (a < b ? 
a : b) #endif #define ring_head(ring) (atomic_read(&(ring).head)) #define ring_tail(ring) (atomic_read(&(ring).tail)) #define ring_posted(ring) (ring_head(ring) - ring_tail(ring)) #define rx_ring_posted(ssk) ring_posted(ssk->rx_ring) #ifdef SDP_ZCOPY #define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \ (ssk->tx_ring.rdma_inflight ? ssk->tx_ring.rdma_inflight->busy : 0)) #else #define tx_ring_posted(ssk) ring_posted(ssk->tx_ring) #endif extern int sdp_zcopy_thresh; extern int rcvbuf_initial_size; extern struct workqueue_struct *rx_comp_wq; extern struct ib_client sdp_client; enum sdp_mid { SDP_MID_HELLO = 0x0, SDP_MID_HELLO_ACK = 0x1, SDP_MID_DISCONN = 0x2, SDP_MID_ABORT = 0x3, SDP_MID_SENDSM = 0x4, SDP_MID_RDMARDCOMPL = 0x6, SDP_MID_SRCAVAIL_CANCEL = 0x8, SDP_MID_CHRCVBUF = 0xB, SDP_MID_CHRCVBUF_ACK = 0xC, SDP_MID_SINKAVAIL = 0xFD, SDP_MID_SRCAVAIL = 0xFE, SDP_MID_DATA = 0xFF, }; enum sdp_flags { SDP_OOB_PRES = 1 << 0, SDP_OOB_PEND = 1 << 1, }; enum { SDP_MIN_TX_CREDITS = 2 }; enum { SDP_ERR_ERROR = -4, SDP_ERR_FAULT = -3, SDP_NEW_SEG = -2, SDP_DO_WAIT_MEM = -1 }; struct sdp_bsdh { u8 mid; u8 flags; __u16 bufs; __u32 len; __u32 mseq; __u32 mseq_ack; } __attribute__((__packed__)); union cma_ip_addr { struct in6_addr ip6; struct { __u32 pad[3]; __u32 addr; } ip4; } __attribute__((__packed__)); /* TODO: too much? Can I avoid having the src/dst and port here? */ struct sdp_hh { struct sdp_bsdh bsdh; u8 majv_minv; u8 ipv_cap; u8 rsvd1; u8 max_adverts; __u32 desremrcvsz; __u32 localrcvsz; __u16 port; __u16 rsvd2; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; u8 rsvd3[IB_CM_REQ_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 48]; } __attribute__((__packed__)); struct sdp_hah { struct sdp_bsdh bsdh; u8 majv_minv; u8 ipv_cap; u8 rsvd1; u8 ext_max_adverts; __u32 actrcvsz; u8 rsvd2[IB_CM_REP_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 8]; } __attribute__((__packed__)); struct sdp_rrch { __u32 len; } __attribute__((__packed__)); struct sdp_srcah { __u32 len; __u32 rkey; __u64 vaddr; } __attribute__((__packed__)); struct sdp_buf { struct mbuf *mb; u64 mapping[SDP_MAX_SEND_SGES]; } __attribute__((__packed__)); struct sdp_chrecvbuf { u32 size; } __attribute__((__packed__)); /* Context used for synchronous zero copy bcopy (BZCOPY) */ struct bzcopy_state { unsigned char __user *u_base; int u_len; int left; int page_cnt; int cur_page; int cur_offset; int busy; struct sdp_sock *ssk; struct page **pages; }; enum rx_sa_flag { RX_SA_ABORTED = 2, }; enum tx_sa_flag { TX_SA_SENDSM = 0x01, TX_SA_CROSS_SEND = 0x02, TX_SA_INTRRUPTED = 0x04, TX_SA_TIMEDOUT = 0x08, TX_SA_ERROR = 0x10, }; struct rx_srcavail_state { /* Advertised buffer stuff */ u32 mseq; u32 used; u32 reported; u32 len; u32 rkey; u64 vaddr; /* Dest buff info */ struct ib_umem *umem; struct ib_pool_fmr *fmr; /* Utility */ u8 busy; enum rx_sa_flag flags; }; struct tx_srcavail_state { /* Data below 'busy' will be reset */ u8 busy; struct ib_umem *umem; struct ib_pool_fmr *fmr; u32 bytes_sent; u32 bytes_acked; enum tx_sa_flag abort_flags; u8 posted; u32 mseq; }; struct sdp_tx_ring { #ifdef SDP_ZCOPY struct rx_srcavail_state *rdma_inflight; #endif struct sdp_buf *buffer; atomic_t head; atomic_t tail; struct ib_cq *cq; atomic_t credits; #define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits)) struct callout timer; u16 poll_cnt; }; struct sdp_rx_ring { struct sdp_buf *buffer; atomic_t head; atomic_t tail; struct ib_cq *cq; int destroyed; struct rwlock destroyed_lock; }; struct sdp_device { struct ib_pd *pd; struct ib_mr *mr; struct 
ib_fmr_pool *fmr_pool; }; struct sdp_moderation { unsigned long last_moder_packets; unsigned long last_moder_tx_packets; unsigned long last_moder_bytes; unsigned long last_moder_jiffies; int last_moder_time; u16 rx_usecs; u16 rx_frames; u16 tx_usecs; u32 pkt_rate_low; u16 rx_usecs_low; u32 pkt_rate_high; u16 rx_usecs_high; u16 sample_interval; u16 adaptive_rx_coal; u32 msg_enable; int moder_cnt; int moder_time; }; /* These are flags fields. */ #define SDP_TIMEWAIT 0x0001 /* In ssk timewait state. */ #define SDP_DROPPED 0x0002 /* Socket has been dropped. */ #define SDP_SOCKREF 0x0004 /* Holding a sockref for close. */ #define SDP_NODELAY 0x0008 /* Disable nagle. */ #define SDP_NEEDFIN 0x0010 /* Send a fin on the next tx. */ #define SDP_DREQWAIT 0x0020 /* Waiting on DREQ. */ #define SDP_DESTROY 0x0040 /* Being destroyed. */ #define SDP_DISCON 0x0080 /* rdma_disconnect is owed. */ /* These are oobflags */ #define SDP_HADOOB 0x0001 /* Had OOB data. */ #define SDP_HAVEOOB 0x0002 /* Have OOB data. */ struct sdp_sock { LIST_ENTRY(sdp_sock) list; struct socket *socket; struct rdma_cm_id *id; struct ib_device *ib_device; struct sdp_device *sdp_dev; struct ib_qp *qp; struct ucred *cred; struct callout keep2msl; /* 2msl and keepalive timer. */ struct callout nagle_timer; /* timeout waiting for ack */ struct ib_ucontext context; in_port_t lport; in_addr_t laddr; in_port_t fport; in_addr_t faddr; int flags; int oobflags; /* protected by rx lock. */ int state; int softerror; int recv_bytes; /* Bytes per recv. buf including header */ int xmit_size_goal; char iobc; struct sdp_rx_ring rx_ring; struct sdp_tx_ring tx_ring; struct rwlock lock; struct mbuf *rx_ctl_q; struct mbuf *rx_ctl_tail; int qp_active; /* XXX Flag. */ int max_sge; struct work_struct rx_comp_work; #define rcv_nxt(ssk) atomic_read(&(ssk->rcv_nxt)) atomic_t rcv_nxt; /* SDP specific */ atomic_t mseq_ack; #define mseq_ack(ssk) (atomic_read(&ssk->mseq_ack)) unsigned max_bufs; /* Initial buffers offered by other side */ unsigned min_bufs; /* Low water mark to wake senders */ unsigned long nagle_last_unacked; /* mseq of latest unacked packet */ atomic_t remote_credits; #define remote_credits(ssk) (atomic_read(&ssk->remote_credits)) int poll_cq; /* SDP slow start */ int recv_request_head; /* mark the rx_head when the resize request was received */ int recv_request; /* XXX flag if request to resize was received */ unsigned long tx_packets; unsigned long rx_packets; unsigned long tx_bytes; unsigned long rx_bytes; struct sdp_moderation auto_mod; struct task shutdown_task; #ifdef SDP_ZCOPY struct tx_srcavail_state *tx_sa; struct rx_srcavail_state *rx_sa; spinlock_t tx_sa_lock; struct delayed_work srcavail_cancel_work; int srcavail_cancel_mseq; /* ZCOPY data: -1:use global; 0:disable zcopy; >0: zcopy threshold */ int zcopy_thresh; #endif }; #define sdp_sk(so) ((struct sdp_sock *)(so->so_pcb)) #define SDP_RLOCK(ssk) rw_rlock(&(ssk)->lock) #define SDP_WLOCK(ssk) rw_wlock(&(ssk)->lock) #define SDP_RUNLOCK(ssk) rw_runlock(&(ssk)->lock) #define SDP_WUNLOCK(ssk) rw_wunlock(&(ssk)->lock) #define SDP_WLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_WLOCKED) #define SDP_RLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_RLOCKED) #define SDP_LOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_LOCKED) static inline void tx_sa_reset(struct tx_srcavail_state *tx_sa) { memset((void *)&tx_sa->busy, 0, sizeof(*tx_sa) - offsetof(typeof(*tx_sa), busy)); } static inline void rx_ring_unlock(struct sdp_rx_ring *rx_ring) { rw_runlock(&rx_ring->destroyed_lock); } static inline int
rx_ring_trylock(struct sdp_rx_ring *rx_ring) { rw_rlock(&rx_ring->destroyed_lock); if (rx_ring->destroyed) { rx_ring_unlock(rx_ring); return 0; } return 1; } static inline void rx_ring_destroy_lock(struct sdp_rx_ring *rx_ring) { rw_wlock(&rx_ring->destroyed_lock); rx_ring->destroyed = 1; rw_wunlock(&rx_ring->destroyed_lock); } static inline void sdp_arm_rx_cq(struct sdp_sock *ssk) { sdp_prf(ssk->socket, NULL, "Arming RX cq"); sdp_dbg_data(ssk->socket, "Arming RX cq\n"); ib_req_notify_cq(ssk->rx_ring.cq, IB_CQ_NEXT_COMP); } static inline void sdp_arm_tx_cq(struct sdp_sock *ssk) { sdp_prf(ssk->socket, NULL, "Arming TX cq"); sdp_dbg_data(ssk->socket, "Arming TX cq. credits: %d, posted: %d\n", tx_credits(ssk), tx_ring_posted(ssk)); ib_req_notify_cq(ssk->tx_ring.cq, IB_CQ_NEXT_COMP); } /* return the min of: * - tx credits * - free slots in tx_ring (not including SDP_MIN_TX_CREDITS) */ static inline int tx_slots_free(struct sdp_sock *ssk) { int min_free; min_free = MIN(tx_credits(ssk), SDP_TX_SIZE - tx_ring_posted(ssk)); if (min_free < SDP_MIN_TX_CREDITS) return 0; return min_free - SDP_MIN_TX_CREDITS; } /* utilities */ static inline char *mid2str(int mid) { #define ENUM2STR(e) [e] = #e static char *mid2str[] = { ENUM2STR(SDP_MID_HELLO), ENUM2STR(SDP_MID_HELLO_ACK), ENUM2STR(SDP_MID_ABORT), ENUM2STR(SDP_MID_DISCONN), ENUM2STR(SDP_MID_SENDSM), ENUM2STR(SDP_MID_RDMARDCOMPL), ENUM2STR(SDP_MID_SRCAVAIL_CANCEL), ENUM2STR(SDP_MID_CHRCVBUF), ENUM2STR(SDP_MID_CHRCVBUF_ACK), ENUM2STR(SDP_MID_DATA), ENUM2STR(SDP_MID_SRCAVAIL), ENUM2STR(SDP_MID_SINKAVAIL), }; if (mid >= ARRAY_SIZE(mid2str)) return NULL; return mid2str[mid]; } static inline struct mbuf * sdp_alloc_mb(struct socket *sk, u8 mid, int size, int wait) { struct sdp_bsdh *h; struct mbuf *mb; MGETHDR(mb, wait, MT_DATA); if (mb == NULL) return (NULL); mb->m_pkthdr.len = mb->m_len = sizeof(struct sdp_bsdh); h = mtod(mb, struct sdp_bsdh *); h->mid = mid; return mb; } static inline struct mbuf * sdp_alloc_mb_data(struct socket *sk, int wait) { return sdp_alloc_mb(sk, SDP_MID_DATA, 0, wait); } static inline struct mbuf * sdp_alloc_mb_disconnect(struct socket *sk, int wait) { return sdp_alloc_mb(sk, SDP_MID_DISCONN, 0, wait); } static inline void * mb_put(struct mbuf *mb, int len) { uint8_t *data; data = mb->m_data; data += mb->m_len; mb->m_len += len; return (void *)data; } static inline struct mbuf * sdp_alloc_mb_chrcvbuf_ack(struct socket *sk, int size, int wait) { struct mbuf *mb; struct sdp_chrecvbuf *resp_size; mb = sdp_alloc_mb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), wait); if (mb == NULL) return (NULL); resp_size = (struct sdp_chrecvbuf *)mb_put(mb, sizeof *resp_size); resp_size->size = htonl(size); return mb; } static inline struct mbuf * sdp_alloc_mb_srcavail(struct socket *sk, u32 len, u32 rkey, u64 vaddr, int wait) { struct mbuf *mb; struct sdp_srcah *srcah; mb = sdp_alloc_mb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), wait); if (mb == NULL) return (NULL); srcah = (struct sdp_srcah *)mb_put(mb, sizeof(*srcah)); srcah->len = htonl(len); srcah->rkey = htonl(rkey); srcah->vaddr = cpu_to_be64(vaddr); return mb; } static inline struct mbuf * sdp_alloc_mb_srcavail_cancel(struct socket *sk, int wait) { return sdp_alloc_mb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, wait); } static inline struct mbuf * sdp_alloc_mb_rdmardcompl(struct socket *sk, u32 len, int wait) { struct mbuf *mb; struct sdp_rrch *rrch; mb = sdp_alloc_mb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), wait); if (mb == NULL) return (NULL); rrch = (struct sdp_rrch *)mb_put(mb, sizeof(*rrch)); rrch->len
= htonl(len); return mb; } static inline struct mbuf * sdp_alloc_mb_sendsm(struct socket *sk, int wait) { return sdp_alloc_mb(sk, SDP_MID_SENDSM, 0, wait); } static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk) { return SDP_TX_SIZE - tx_ring_posted(ssk); } static inline int credit_update_needed(struct sdp_sock *ssk) { int c; c = remote_credits(ssk); if (likely(c > SDP_MIN_TX_CREDITS)) c += c/2; return unlikely(c < rx_ring_posted(ssk)) && likely(tx_credits(ssk) > 0) && likely(sdp_tx_ring_slots_left(ssk)); } #define SDPSTATS_COUNTER_INC(stat) #define SDPSTATS_COUNTER_ADD(stat, val) #define SDPSTATS_COUNTER_MID_INC(stat, mid) #define SDPSTATS_HIST_LINEAR(stat, size) #define SDPSTATS_HIST(stat, size) static inline void sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf, enum dma_data_direction dir) { struct ib_device *dev; struct mbuf *mb; int i; dev = ssk->ib_device; for (i = 0, mb = sbuf->mb; mb != NULL; mb = mb->m_next, i++) ib_dma_unmap_single(dev, sbuf->mapping[i], mb->m_len, dir); } /* sdp_main.c */ void sdp_set_default_moderation(struct sdp_sock *ssk); void sdp_start_keepalive_timer(struct socket *sk); void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb); void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk); void sdp_abort(struct socket *sk); struct sdp_sock *sdp_notify(struct sdp_sock *ssk, int error); /* sdp_cma.c */ int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *); /* sdp_tx.c */ int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device); void sdp_tx_ring_destroy(struct sdp_sock *ssk); int sdp_xmit_poll(struct sdp_sock *ssk, int force); void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb); void sdp_post_sends(struct sdp_sock *ssk, int wait); void sdp_post_keepalive(struct sdp_sock *ssk); /* sdp_rx.c */ void sdp_rx_ring_init(struct sdp_sock *ssk); int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device); void sdp_rx_ring_destroy(struct sdp_sock *ssk); int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size); int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size); void sdp_do_posts(struct sdp_sock *ssk); void sdp_rx_comp_full(struct sdp_sock *ssk); /* sdp_zcopy.c */ struct kiocb; int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov); int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah); void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack); void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack, u32 bytes_completed); int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk); int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb, unsigned long *used); int sdp_post_rdma_rd_compl(struct sdp_sock *ssk, struct rx_srcavail_state *rx_sa); int sdp_post_sendsm(struct socket *sk); void srcavail_cancel_timeout(struct work_struct *work); void sdp_abort_srcavail(struct socket *sk); void sdp_abort_rdma_read(struct socket *sk); int sdp_process_rx(struct sdp_sock *ssk); #endif Index: head/sys/ofed/drivers/infiniband/util/madeye.c =================================================================== --- head/sys/ofed/drivers/infiniband/util/madeye.c (revision 300675) +++ head/sys/ofed/drivers/infiniband/util/madeye.c (revision 300676) @@ -1,593 +1,596 @@ /* * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005, 2006 Voltaire Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $Id$ */ + +#define LINUXKPI_PARAM_PREFIX ib_madeye_ + #include #include #include #include #include #include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand MAD viewer"); MODULE_LICENSE("Dual BSD/GPL"); static void madeye_remove_one(struct ib_device *device); static void madeye_add_one(struct ib_device *device); static struct ib_client madeye_client = { .name = "madeye", .add = madeye_add_one, .remove = madeye_remove_one }; struct madeye_port { struct ib_mad_agent *smi_agent; struct ib_mad_agent *gsi_agent; }; static int smp = 1; static int gmp = 1; static int mgmt_class = 0; static int attr_id = 0; static int data = 0; module_param(smp, int, 0444); module_param(gmp, int, 0444); module_param(mgmt_class, int, 0444); module_param(attr_id, int, 0444); module_param(data, int, 0444); MODULE_PARM_DESC(smp, "Display all SMPs (default=1)"); MODULE_PARM_DESC(gmp, "Display all GMPs (default=1)"); MODULE_PARM_DESC(mgmt_class, "Display all MADs of specified class (default=0)"); MODULE_PARM_DESC(attr_id, "Display all MADs of specified attribute ID (default=0)"); MODULE_PARM_DESC(data, "Display data area of MADs (default=0)"); static char * get_class_name(u8 mgmt_class) { switch(mgmt_class) { case IB_MGMT_CLASS_SUBN_LID_ROUTED: return "LID routed SMP"; case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: return "Directed route SMP"; case IB_MGMT_CLASS_SUBN_ADM: return "Subnet admin."; case IB_MGMT_CLASS_PERF_MGMT: return "Perf. mgmt."; case IB_MGMT_CLASS_BM: return "Baseboard mgmt."; case IB_MGMT_CLASS_DEVICE_MGMT: return "Device mgmt."; case IB_MGMT_CLASS_CM: return "Comm. 
mgmt."; case IB_MGMT_CLASS_SNMP: return "SNMP"; default: return "Unknown vendor/application"; } } static char * get_method_name(u8 mgmt_class, u8 method) { switch(method) { case IB_MGMT_METHOD_GET: return "Get"; case IB_MGMT_METHOD_SET: return "Set"; case IB_MGMT_METHOD_GET_RESP: return "Get response"; case IB_MGMT_METHOD_SEND: return "Send"; case IB_MGMT_METHOD_SEND | IB_MGMT_METHOD_RESP: return "Send response"; case IB_MGMT_METHOD_TRAP: return "Trap"; case IB_MGMT_METHOD_REPORT: return "Report"; case IB_MGMT_METHOD_REPORT_RESP: return "Report response"; case IB_MGMT_METHOD_TRAP_REPRESS: return "Trap repress"; default: break; } switch (mgmt_class) { case IB_MGMT_CLASS_SUBN_ADM: switch (method) { case IB_SA_METHOD_GET_TABLE: return "Get table"; case IB_SA_METHOD_GET_TABLE_RESP: return "Get table response"; case IB_SA_METHOD_DELETE: return "Delete"; case IB_SA_METHOD_DELETE_RESP: return "Delete response"; case IB_SA_METHOD_GET_MULTI: return "Get Multi"; case IB_SA_METHOD_GET_MULTI_RESP: return "Get Multi response"; case IB_SA_METHOD_GET_TRACE_TBL: return "Get Trace Table response"; default: break; } default: break; } return "Unknown"; } static void print_status_details(u16 status) { if (status & 0x0001) printk(" busy\n"); if (status & 0x0002) printk(" redirection required\n"); switch((status & 0x001C) >> 2) { case 1: printk(" bad version\n"); break; case 2: printk(" method not supported\n"); break; case 3: printk(" method/attribute combo not supported\n"); break; case 7: printk(" invalid attribute/modifier value\n"); break; } } static char * get_sa_attr(__be16 attr) { switch(attr) { case IB_SA_ATTR_CLASS_PORTINFO: return "Class Port Info"; case IB_SA_ATTR_NOTICE: return "Notice"; case IB_SA_ATTR_INFORM_INFO: return "Inform Info"; case IB_SA_ATTR_NODE_REC: return "Node Record"; case IB_SA_ATTR_PORT_INFO_REC: return "PortInfo Record"; case IB_SA_ATTR_SL2VL_REC: return "SL to VL Record"; case IB_SA_ATTR_SWITCH_REC: return "Switch Record"; case IB_SA_ATTR_LINEAR_FDB_REC: return "Linear FDB Record"; case IB_SA_ATTR_RANDOM_FDB_REC: return "Random FDB Record"; case IB_SA_ATTR_MCAST_FDB_REC: return "Multicast FDB Record"; case IB_SA_ATTR_SM_INFO_REC: return "SM Info Record"; case IB_SA_ATTR_LINK_REC: return "Link Record"; case IB_SA_ATTR_GUID_INFO_REC: return "Guid Info Record"; case IB_SA_ATTR_SERVICE_REC: return "Service Record"; case IB_SA_ATTR_PARTITION_REC: return "Partition Record"; case IB_SA_ATTR_PATH_REC: return "Path Record"; case IB_SA_ATTR_VL_ARB_REC: return "VL Arb Record"; case IB_SA_ATTR_MC_MEMBER_REC: return "MC Member Record"; case IB_SA_ATTR_TRACE_REC: return "Trace Record"; case IB_SA_ATTR_MULTI_PATH_REC: return "Multi Path Record"; case IB_SA_ATTR_SERVICE_ASSOC_REC: return "Service Assoc Record"; case IB_SA_ATTR_INFORM_INFO_REC: return "Inform Info Record"; default: return ""; } } static void print_mad_hdr(struct ib_mad_hdr *mad_hdr) { printk("MAD version....0x%01x\n", mad_hdr->base_version); printk("Class..........0x%01x (%s)\n", mad_hdr->mgmt_class, get_class_name(mad_hdr->mgmt_class)); printk("Class version..0x%01x\n", mad_hdr->class_version); printk("Method.........0x%01x (%s)\n", mad_hdr->method, get_method_name(mad_hdr->mgmt_class, mad_hdr->method)); printk("Status.........0x%02x\n", be16_to_cpu(mad_hdr->status)); if (mad_hdr->status) print_status_details(be16_to_cpu(mad_hdr->status)); printk("Class specific.0x%02x\n", be16_to_cpu(mad_hdr->class_specific)); printk("Trans ID.......0x%llx\n", (unsigned long long)be64_to_cpu(mad_hdr->tid)); if (mad_hdr->mgmt_class == 
IB_MGMT_CLASS_SUBN_ADM) printk("Attr ID........0x%02x (%s)\n", be16_to_cpu(mad_hdr->attr_id), get_sa_attr(be16_to_cpu(mad_hdr->attr_id))); else printk("Attr ID........0x%02x\n", be16_to_cpu(mad_hdr->attr_id)); printk("Attr modifier..0x%04x\n", be32_to_cpu(mad_hdr->attr_mod)); } static char * get_rmpp_type(u8 rmpp_type) { switch (rmpp_type) { case IB_MGMT_RMPP_TYPE_DATA: return "Data"; case IB_MGMT_RMPP_TYPE_ACK: return "Ack"; case IB_MGMT_RMPP_TYPE_STOP: return "Stop"; case IB_MGMT_RMPP_TYPE_ABORT: return "Abort"; default: return "Unknown"; } } static char * get_rmpp_flags(u8 rmpp_flags) { if (rmpp_flags & IB_MGMT_RMPP_FLAG_ACTIVE) if (rmpp_flags & IB_MGMT_RMPP_FLAG_FIRST) if (rmpp_flags & IB_MGMT_RMPP_FLAG_LAST) return "Active - First & Last"; else return "Active - First"; else if (rmpp_flags & IB_MGMT_RMPP_FLAG_LAST) return "Active - Last"; else return "Active"; else return "Inactive"; } static void print_rmpp_hdr(struct ib_rmpp_hdr *rmpp_hdr) { printk("RMPP version...0x%01x\n", rmpp_hdr->rmpp_version); printk("RMPP type......0x%01x (%s)\n", rmpp_hdr->rmpp_type, get_rmpp_type(rmpp_hdr->rmpp_type)); printk("RMPP RRespTime.0x%01x\n", ib_get_rmpp_resptime(rmpp_hdr)); printk("RMPP flags.....0x%01x (%s)\n", ib_get_rmpp_flags(rmpp_hdr), get_rmpp_flags(ib_get_rmpp_flags(rmpp_hdr))); printk("RMPP status....0x%01x\n", rmpp_hdr->rmpp_status); printk("Seg number.....0x%04x\n", be32_to_cpu(rmpp_hdr->seg_num)); switch (rmpp_hdr->rmpp_type) { case IB_MGMT_RMPP_TYPE_DATA: printk("Payload len....0x%04x\n", be32_to_cpu(rmpp_hdr->paylen_newwin)); break; case IB_MGMT_RMPP_TYPE_ACK: printk("New window.....0x%04x\n", be32_to_cpu(rmpp_hdr->paylen_newwin)); break; default: printk("Data 2.........0x%04x\n", be32_to_cpu(rmpp_hdr->paylen_newwin)); break; } } static char * get_smp_attr(__be16 attr) { switch (attr) { case IB_SMP_ATTR_NOTICE: return "notice"; case IB_SMP_ATTR_NODE_DESC: return "node description"; case IB_SMP_ATTR_NODE_INFO: return "node info"; case IB_SMP_ATTR_SWITCH_INFO: return "switch info"; case IB_SMP_ATTR_GUID_INFO: return "GUID info"; case IB_SMP_ATTR_PORT_INFO: return "port info"; case IB_SMP_ATTR_PKEY_TABLE: return "pkey table"; case IB_SMP_ATTR_SL_TO_VL_TABLE: return "SL to VL table"; case IB_SMP_ATTR_VL_ARB_TABLE: return "VL arbitration table"; case IB_SMP_ATTR_LINEAR_FORWARD_TABLE: return "linear forwarding table"; case IB_SMP_ATTR_RANDOM_FORWARD_TABLE: return "random forward table"; case IB_SMP_ATTR_MCAST_FORWARD_TABLE: return "multicast forward table"; case IB_SMP_ATTR_SM_INFO: return "SM info"; case IB_SMP_ATTR_VENDOR_DIAG: return "vendor diags"; case IB_SMP_ATTR_LED_INFO: return "LED info"; default: return ""; } } static void print_smp(struct ib_smp *smp) { int i; printk("MAD version....0x%01x\n", smp->base_version); printk("Class..........0x%01x (%s)\n", smp->mgmt_class, get_class_name(smp->mgmt_class)); printk("Class version..0x%01x\n", smp->class_version); printk("Method.........0x%01x (%s)\n", smp->method, get_method_name(smp->mgmt_class, smp->method)); printk("Status.........0x%02x\n", be16_to_cpu(smp->status)); if (smp->status) print_status_details(be16_to_cpu(smp->status)); printk("Hop pointer....0x%01x\n", smp->hop_ptr); printk("Hop counter....0x%01x\n", smp->hop_cnt); printk("Trans ID.......0x%llx\n", (unsigned long long)be64_to_cpu(smp->tid)); printk("Attr ID........0x%02x (%s)\n", be16_to_cpu(smp->attr_id), get_smp_attr(smp->attr_id)); printk("Attr modifier..0x%04x\n", be32_to_cpu(smp->attr_mod)); printk("Mkey...........0x%llx\n", (unsigned long 
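/*
 * [Editorial aside, not part of the diff.]  The nested ifs in
 * get_rmpp_flags() above encode a small truth table over the
 * Active/First/Last bits.  A behavior-preserving, flatter formulation
 * (sketch only; it reuses the IB_MGMT_RMPP_FLAG_* constants already
 * available to this file):
 */
static char *
get_rmpp_flags_alt(u8 rmpp_flags)
{
        if (!(rmpp_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
                return "Inactive";

        switch (rmpp_flags &
            (IB_MGMT_RMPP_FLAG_FIRST | IB_MGMT_RMPP_FLAG_LAST)) {
        case IB_MGMT_RMPP_FLAG_FIRST | IB_MGMT_RMPP_FLAG_LAST:
                return "Active - First & Last";
        case IB_MGMT_RMPP_FLAG_FIRST:
                return "Active - First";
        case IB_MGMT_RMPP_FLAG_LAST:
                return "Active - Last";
        default:
                return "Active";
        }
}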
long)be64_to_cpu(smp->mkey)); printk("DR SLID........0x%02x\n", be16_to_cpu(smp->dr_slid)); printk("DR DLID........0x%02x", be16_to_cpu(smp->dr_dlid)); if (data) { for (i = 0; i < IB_SMP_DATA_SIZE; i++) { if (i % 16 == 0) printk("\nSMP Data......."); printk("%01x ", smp->data[i]); } for (i = 0; i < IB_SMP_MAX_PATH_HOPS; i++) { if (i % 16 == 0) printk("\nInitial path..."); printk("%01x ", smp->initial_path[i]); } for (i = 0; i < IB_SMP_MAX_PATH_HOPS; i++) { if (i % 16 == 0) printk("\nReturn path...."); printk("%01x ", smp->return_path[i]); } } printk("\n"); } static void snoop_smi_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_hdr *hdr = send_buf->mad; if (!smp && hdr->mgmt_class != mgmt_class) return; if (attr_id && be16_to_cpu(hdr->attr_id) != attr_id) return; printk("Madeye:sent SMP\n"); print_smp(send_buf->mad); } static void recv_smi_handler(struct ib_mad_agent *mad_agent, struct ib_mad_recv_wc *mad_recv_wc) { if (!smp && mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class != mgmt_class) return; if (attr_id && be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) != attr_id) return; printk("Madeye:recv SMP\n"); print_smp((struct ib_smp *)&mad_recv_wc->recv_buf.mad->mad_hdr); } static int is_rmpp_mad(struct ib_mad_hdr *mad_hdr) { if (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) { switch (mad_hdr->method) { case IB_SA_METHOD_GET_TABLE: case IB_SA_METHOD_GET_TABLE_RESP: case IB_SA_METHOD_GET_MULTI_RESP: return 1; default: break; } } else if ((mad_hdr->mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && (mad_hdr->mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) return 1; return 0; } static void snoop_gsi_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_hdr *hdr = send_buf->mad; if (!gmp && hdr->mgmt_class != mgmt_class) return; if (attr_id && be16_to_cpu(hdr->attr_id) != attr_id) return; printk("Madeye:sent GMP\n"); print_mad_hdr(hdr); if (is_rmpp_mad(hdr)) print_rmpp_hdr(&((struct ib_rmpp_mad *) hdr)->rmpp_hdr); } static void recv_gsi_handler(struct ib_mad_agent *mad_agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_hdr *hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; struct ib_rmpp_mad *mad = NULL; struct ib_sa_mad *sa_mad; struct ib_vendor_mad *vendor_mad; u8 *mad_data; int i, j; if (!gmp && hdr->mgmt_class != mgmt_class) return; if (attr_id && be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) != attr_id) return; printk("Madeye:recv GMP\n"); print_mad_hdr(hdr); if (is_rmpp_mad(hdr)) { mad = (struct ib_rmpp_mad *) hdr; print_rmpp_hdr(&mad->rmpp_hdr); } if (data) { if (hdr->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) { j = IB_MGMT_SA_DATA; /* Display SA header */ if (is_rmpp_mad(hdr) && mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) return; sa_mad = (struct ib_sa_mad *) &mad_recv_wc->recv_buf.mad; mad_data = sa_mad->data; } else { if (is_rmpp_mad(hdr)) { j = IB_MGMT_VENDOR_DATA; /* Display OUI */ vendor_mad = (struct ib_vendor_mad *) &mad_recv_wc->recv_buf.mad; printk("Vendor OUI......%01x %01x %01x\n", vendor_mad->oui[0], vendor_mad->oui[1], vendor_mad->oui[2]); mad_data = vendor_mad->data; } else { j = IB_MGMT_MAD_DATA; mad_data = mad_recv_wc->recv_buf.mad->data; } } for (i = 0; i < j; i++) { if (i % 16 == 0) printk("\nData..........."); printk("%01x ", mad_data[i]); } printk("\n"); } } static void madeye_add_one(struct ib_device *device) { struct madeye_port *port; int reg_flags; u8 i, s, e; if (device->node_type == 
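/*
 * [Editorial aside, not part of the diff.]  madeye_add_one(), which
 * this point interrupts, applies the usual RDMA port-numbering rule: a
 * switch exposes only management port 0, while an HCA exposes ports
 * 1..phys_port_cnt.  The snoop-agent array is therefore e - s + 1
 * entries long and indexed by (port - s).  The rule in isolation
 * (hedged sketch, not driver code):
 */
static void
madeye_port_range(int is_switch, int phys_port_cnt, int *s, int *e)
{
        if (is_switch) {
                *s = 0;                 /* switch management port */
                *e = 0;
        } else {
                *s = 1;                 /* HCA ports are 1-based */
                *e = phys_port_cnt;
        }
        /* callers allocate *e - *s + 1 entries */
}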
RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } port = kmalloc(sizeof *port * (e - s + 1), GFP_KERNEL); if (!port) goto out; reg_flags = IB_MAD_SNOOP_SEND_COMPLETIONS | IB_MAD_SNOOP_RECVS; for (i = 0; i <= e - s; i++) { port[i].smi_agent = ib_register_mad_snoop(device, i + s, IB_QPT_SMI, reg_flags, snoop_smi_handler, recv_smi_handler, &port[i]); port[i].gsi_agent = ib_register_mad_snoop(device, i + s, IB_QPT_GSI, reg_flags, snoop_gsi_handler, recv_gsi_handler, &port[i]); } out: ib_set_client_data(device, &madeye_client, port); } static void madeye_remove_one(struct ib_device *device) { struct madeye_port *port; int i, s, e; port = (struct madeye_port *) ib_get_client_data(device, &madeye_client); if (!port) return; if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } for (i = 0; i <= e - s; i++) { if (!IS_ERR(port[i].smi_agent)) ib_unregister_mad_agent(port[i].smi_agent); if (!IS_ERR(port[i].gsi_agent)) ib_unregister_mad_agent(port[i].gsi_agent); } kfree(port); } static int __init ib_madeye_init(void) { return ib_register_client(&madeye_client); } static void __exit ib_madeye_cleanup(void) { ib_unregister_client(&madeye_client); } module_init(ib_madeye_init); module_exit(ib_madeye_cleanup); Index: head/sys/ofed/drivers/net/mlx4/catas.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/catas.c (revision 300675) +++ head/sys/ofed/drivers/net/mlx4/catas.c (revision 300676) @@ -1,173 +1,175 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#define LINUXKPI_PARAM_PREFIX mlx4_ + #include #include #include #include "mlx4.h" #define MLX4_CATAS_POLL_INTERVAL (5 * HZ) static DEFINE_SPINLOCK(catas_lock); static LIST_HEAD(catas_list); static struct work_struct catas_work; static int internal_err_reset = 1; module_param(internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, "Reset device on internal errors if non-zero" " (default 1, in SRIOV mode default is 0)"); static void dump_err_buf(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int i; mlx4_err(dev, "Internal error detected:\n"); for (i = 0; i < priv->fw.catas_size; ++i) mlx4_err(dev, " buf[%02x]: %08x\n", i, swab32(readl(priv->catas_err.map + i))); } static void poll_catas(unsigned long dev_ptr) { struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; struct mlx4_priv *priv = mlx4_priv(dev); if (readl(priv->catas_err.map)) { /* If the device is off-line, we cannot try to recover it */ if (pci_channel_offline(dev->pdev)) mod_timer(&priv->catas_err.timer, round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); else { dump_err_buf(dev); mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); if (internal_err_reset) { spin_lock(&catas_lock); list_add(&priv->catas_err.list, &catas_list); spin_unlock(&catas_lock); queue_work(mlx4_wq, &catas_work); } } } else mod_timer(&priv->catas_err.timer, round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); } static void catas_reset(struct work_struct *work) { struct mlx4_priv *priv, *tmppriv; struct mlx4_dev *dev; LIST_HEAD(tlist); int ret; spin_lock_irq(&catas_lock); list_splice_init(&catas_list, &tlist); spin_unlock_irq(&catas_lock); list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { struct pci_dev *pdev = priv->dev.pdev; /* If the device is off-line, we cannot reset it */ if (pci_channel_offline(pdev)) continue; ret = mlx4_restart_one(priv->dev.pdev); /* 'priv' now is not valid */ if (ret) pr_err("mlx4 %s: Reset failed (%d)\n", pci_name(pdev), ret); else { dev = pci_get_drvdata(pdev); mlx4_dbg(dev, "Reset succeeded\n"); } } } void mlx4_start_catas_poll(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); phys_addr_t addr; /*If we are in SRIOV the default of the module param must be 0*/ if (mlx4_is_mfunc(dev)) internal_err_reset = 0; INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); priv->catas_err.map = NULL; addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) + priv->fw.catas_offset; priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); if (!priv->catas_err.map) { mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", (unsigned long long) addr); return; } priv->catas_err.timer.data = (unsigned long) dev; priv->catas_err.timer.function = poll_catas; priv->catas_err.timer.expires = round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); add_timer(&priv->catas_err.timer); } void mlx4_stop_catas_poll(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); del_timer_sync(&priv->catas_err.timer); if (priv->catas_err.map) { iounmap(priv->catas_err.map); priv->catas_err.map = NULL; } spin_lock_irq(&catas_lock); list_del_init(&priv->catas_err.list); spin_unlock_irq(&catas_lock); } void __init mlx4_catas_init(void) { INIT_WORK(&catas_work, catas_reset); } Index: head/sys/ofed/drivers/net/mlx4/en_main.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/en_main.c (revision 300675) +++ head/sys/ofed/drivers/net/mlx4/en_main.c (revision 300676) @@ -1,341 +1,343 @@ /* * Copyright (c) 2007, 
2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ +#define LINUXKPI_PARAM_PREFIX mlx4_ + #include #include #include #include #include #include #include #include "mlx4_en.h" /* Mellanox ConnectX HCA Ethernet driver */ #define MLX4_EN_PARM_INT(X, def_val, desc) \ static unsigned int X = def_val;\ module_param(X , uint, 0444); \ MODULE_PARM_DESC(X, desc); /* * Device scope module parameters */ /* Enable RSS UDP traffic */ MLX4_EN_PARM_INT(udp_rss, 1, "Enable RSS for incoming UDP traffic"); /* Priority pausing */ MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." " Per priority bit mask"); MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]." 
" Per priority bit mask"); #define MAX_PFC_TX 0xff #define MAX_PFC_RX 0xff static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) { struct mlx4_en_profile *params = &mdev->profile; int i; params->udp_rss = udp_rss; params->num_tx_rings_p_up = min_t(int, mp_ncpus, MLX4_EN_MAX_TX_RING_P_UP); if (params->udp_rss && !(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) { mlx4_warn(mdev, "UDP RSS is not supported on this device.\n"); params->udp_rss = 0; } for (i = 1; i <= MLX4_MAX_PORTS; i++) { params->prof[i].rx_pause = 1; params->prof[i].rx_ppp = pfcrx; params->prof[i].tx_pause = 1; params->prof[i].tx_ppp = pfctx; params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE; params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE; params->prof[i].tx_ring_num = params->num_tx_rings_p_up * MLX4_EN_NUM_UP; params->prof[i].rss_rings = 0; } return 0; } static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) { struct mlx4_en_dev *endev = ctx; return endev->pndev[port]; } static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, enum mlx4_dev_event event, unsigned long port) { struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr; struct mlx4_en_priv *priv; switch (event) { case MLX4_DEV_EVENT_PORT_UP: case MLX4_DEV_EVENT_PORT_DOWN: if (!mdev->pndev[port]) return; priv = netdev_priv(mdev->pndev[port]); /* To prevent races, we poll the link state in a separate task rather than changing it here */ priv->link_state = event; queue_work(mdev->workqueue, &priv->linkstate_task); break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: mlx4_err(mdev, "Internal error detected, restarting device\n"); break; case MLX4_DEV_EVENT_SLAVE_INIT: case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: break; default: if (port < 1 || port > dev->caps.num_ports || !mdev->pndev[port]) return; mlx4_warn(mdev, "Unhandled event %d for port %d\n", event, (int) port); } } static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) { struct mlx4_en_dev *mdev = endev_ptr; int i, ret; mutex_lock(&mdev->state_lock); mdev->device_up = false; mutex_unlock(&mdev->state_lock); mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) if (mdev->pndev[i]) mlx4_en_destroy_netdev(mdev->pndev[i]); flush_workqueue(mdev->workqueue); destroy_workqueue(mdev->workqueue); ret = mlx4_mr_free(dev, &mdev->mr); if (ret) mlx4_err(mdev, "Error deregistering MR. 
The system may have become unstable."); iounmap(mdev->uar_map); mlx4_uar_free(dev, &mdev->priv_uar); mlx4_pd_free(dev, mdev->priv_pdn); kfree(mdev); } static void *mlx4_en_add(struct mlx4_dev *dev) { struct mlx4_en_dev *mdev; int i; int err; mdev = kzalloc(sizeof *mdev, GFP_KERNEL); if (!mdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; goto err_free_res; } if (mlx4_pd_alloc(dev, &mdev->priv_pdn)) goto err_free_dev; if (mlx4_uar_alloc(dev, &mdev->priv_uar)) goto err_pd; mdev->uar_map = ioremap((phys_addr_t) mdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!mdev->uar_map) goto err_uar; spin_lock_init(&mdev->uar_lock); mdev->dev = dev; mdev->dma_device = &(dev->pdev->dev); mdev->pdev = dev->pdev; mdev->device_up = false; mdev->LSO_support = !!(dev->caps.flags & (1 << 15)); if (!mdev->LSO_support) mlx4_warn(mdev, "LSO not supported, please upgrade to later " "FW version to enable LSO\n"); if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull, MLX4_PERM_LOCAL_WRITE | MLX4_PERM_LOCAL_READ, 0, 0, &mdev->mr)) { mlx4_err(mdev, "Failed allocating memory region\n"); goto err_map; } if (mlx4_mr_enable(mdev->dev, &mdev->mr)) { mlx4_err(mdev, "Failed enabling memory region\n"); goto err_mr; } /* Build device profile according to supplied module parameters */ err = mlx4_en_get_profile(mdev); if (err) { mlx4_err(mdev, "Bad module parameters, aborting.\n"); goto err_mr; } /* Configure which ports to start according to module parameters */ mdev->port_cnt = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) mdev->port_cnt++; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { if (!dev->caps.comp_pool) { mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(max_t(int, MIN_RX_RINGS, min_t(int, dev->caps.num_comp_vectors, DEF_RX_RINGS))); } else { mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two( min_t(int, dev->caps.comp_pool / dev->caps.num_ports, MAX_MSIX_P_PORT)); } } /* Create our own workqueue for reset/multicast tasks * Note: we cannot use the shared workqueue because of deadlocks caused * by the rtnl lock */ mdev->workqueue = create_singlethread_workqueue("mlx4_en"); if (!mdev->workqueue) { err = -ENOMEM; goto err_mr; } /* At this stage all non-port specific tasks are complete: * mark the card state as up */ mutex_init(&mdev->state_lock); mdev->device_up = true; /* Setup ports */ /* Create a netdev for each port */ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { mlx4_info(mdev, "Activating port:%d\n", i); if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) mdev->pndev[i] = NULL; } return mdev; err_mr: err = mlx4_mr_free(dev, &mdev->mr); if (err) mlx4_err(mdev, "Error deregistering MR. 
The system may have become unstable."); err_map: if (mdev->uar_map) iounmap(mdev->uar_map); err_uar: mlx4_uar_free(dev, &mdev->priv_uar); err_pd: mlx4_pd_free(dev, mdev->priv_pdn); err_free_dev: kfree(mdev); err_free_res: return NULL; } static struct mlx4_interface mlx4_en_interface = { .add = mlx4_en_add, .remove = mlx4_en_remove, .event = mlx4_en_event, .get_dev = mlx4_en_get_netdev, .protocol = MLX4_PROT_ETH, }; static void mlx4_en_verify_params(void) { if (pfctx > MAX_PFC_TX) { pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - " "should be in range 0-0x%x, will be changed to default (0)\n", pfctx, MAX_PFC_TX); pfctx = 0; } if (pfcrx > MAX_PFC_RX) { pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - " "should be in range 0-0x%x, will be changed to default (0)\n", pfcrx, MAX_PFC_RX); pfcrx = 0; } } static int __init mlx4_en_init(void) { mlx4_en_verify_params(); #ifdef CONFIG_DEBUG_FS int err = 0; err = mlx4_en_register_debugfs(); if (err) pr_err(KERN_ERR "Failed to register debugfs\n"); #endif return mlx4_register_interface(&mlx4_en_interface); } static void __exit mlx4_en_cleanup(void) { mlx4_unregister_interface(&mlx4_en_interface); #ifdef CONFIG_DEBUG_FS mlx4_en_unregister_debugfs(); #endif } module_init(mlx4_en_init); module_exit(mlx4_en_cleanup); static int mlxen_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlxen_mod = { .name = "mlxen", .evhand = mlxen_evhand, }; DECLARE_MODULE(mlxen, mlxen_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY); MODULE_DEPEND(mlxen, mlx4, 1, 1, 1); MODULE_DEPEND(mlxen, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/net/mlx4/en_tx.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/en_tx.c (revision 300675) +++ head/sys/ofed/drivers/net/mlx4/en_tx.c (revision 300676) @@ -1,1117 +1,1119 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ +#define LINUXKPI_PARAM_PREFIX mlx4_ + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_en.h" enum { MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ MAX_BF = 256, MIN_PKT_LEN = 17, }; static int inline_thold __read_mostly = MAX_INLINE; module_param_named(inline_thold, inline_thold, uint, 0444); MODULE_PARM_DESC(inline_thold, "threshold for using inline data"); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring, u32 size, u16 stride, int node, int queue_idx) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring; uint32_t x; int tmp; int err; ring = kzalloc_node(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL, node); if (!ring) { ring = kzalloc(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL); if (!ring) { en_err(priv, "Failed allocating TX ring\n"); return -ENOMEM; } } /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX4_EN_TX_MAX_PAYLOAD_SIZE, /* maxsize */ MLX4_EN_TX_MAX_MBUF_FRAGS, /* nsegments */ MLX4_EN_TX_MAX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &ring->dma_tag))) goto done; ring->size = size; ring->size_mask = size - 1; ring->stride = stride; ring->inline_thold = MAX(MIN_PKT_LEN, MIN(inline_thold, MAX_INLINE)); mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF); mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF); /* Allocate the buf ring */ ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF, M_WAITOK, &ring->tx_lock.m); if (ring->br == NULL) { en_err(priv, "Failed allocating tx_info ring\n"); err = -ENOMEM; goto err_free_dma_tag; } tmp = size * sizeof(struct mlx4_en_tx_info); ring->tx_info = kzalloc_node(tmp, GFP_KERNEL, node); if (!ring->tx_info) { ring->tx_info = kzalloc(tmp, GFP_KERNEL); if (!ring->tx_info) { err = -ENOMEM; goto err_ring; } } /* Create DMA descriptor MAPs */ for (x = 0; x != size; x++) { err = -bus_dmamap_create(ring->dma_tag, 0, &ring->tx_info[x].dma_map); if (err != 0) { while (x--) { bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); } goto err_info; } } en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", ring->tx_info, tmp); ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); /* Allocate HW buffers on provided NUMA node */ err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); if (err) { en_err(priv, "Failed allocating hwq resources\n"); goto err_dma_map; } err = mlx4_en_map_buffer(&ring->wqres.buf); if (err) { en_err(priv, "Failed to map TX buffer\n"); goto err_hwq_res; } ring->buf = ring->wqres.buf.direct.buf; en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d " "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, MLX4_RESERVE_BF_QP); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_map; } err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_reserve; } ring->qp.event = mlx4_en_sqp_event; err = mlx4_bf_alloc(mdev->dev, &ring->bf, node); if (err) { en_dbg(DRV, priv, "working without blueflame (%d)", err); ring->bf.uar = &mdev->priv_uar; ring->bf.uar->map = 
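/*
 * [Editorial aside, not part of the diff.]  Note the sign flips earlier
 * in this function: "err = -bus_dma_tag_create(...)" and
 * "err = -bus_dmamap_create(...)".  FreeBSD's bus_dma calls return a
 * positive errno on failure, while this Linux-derived driver propagates
 * negative errno values, so the result is negated at the call site.
 * The convention bridge as a hedged helper:
 */
static inline int
bsd_to_linux_errno(int error)
{
        return (error > 0 ? -error : error);    /* success (0) unchanged */
}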
mdev->uar_map; ring->bf_enabled = false; } else ring->bf_enabled = true; ring->queue_index = queue_idx; if (queue_idx < priv->num_tx_rings_p_up ) CPU_SET(queue_idx, &ring->affinity_mask); *pring = ring; return 0; err_reserve: mlx4_qp_release_range(mdev->dev, ring->qpn, 1); err_map: mlx4_en_unmap_buffer(&ring->wqres.buf); err_hwq_res: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_dma_map: for (x = 0; x != size; x++) bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); err_info: vfree(ring->tx_info); err_ring: buf_ring_free(ring->br, M_DEVBUF); err_free_dma_tag: bus_dma_tag_destroy(ring->dma_tag); done: kfree(ring); return err; } void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring = *pring; uint32_t x; en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); buf_ring_free(ring->br, M_DEVBUF); if (ring->bf_enabled) mlx4_bf_free(mdev->dev, &ring->bf); mlx4_qp_remove(mdev->dev, &ring->qp); mlx4_qp_free(mdev->dev, &ring->qp); mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1); mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); for (x = 0; x != ring->size; x++) bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); vfree(ring->tx_info); mtx_destroy(&ring->tx_lock.m); mtx_destroy(&ring->comp_lock.m); bus_dma_tag_destroy(ring->dma_tag); kfree(ring); *pring = NULL; } int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int cq, int user_prio) { struct mlx4_en_dev *mdev = priv->mdev; int err; ring->cqn = cq; ring->prod = 0; ring->cons = 0xffffffff; ring->last_nr_txbb = 1; ring->poll_cnt = 0; ring->blocked = 0; memset(ring->buf, 0, ring->buf_size); ring->qp_state = MLX4_QP_STATE_RST; ring->doorbell_qpn = ring->qp.qpn << 8; mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, ring->cqn, user_prio, &ring->context); if (ring->bf_enabled) ring->context.usr_page = cpu_to_be32(ring->bf.uar->index); err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context, &ring->qp, &ring->qp_state); return err; } void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring) { struct mlx4_en_dev *mdev = priv->mdev; mlx4_qp_modify(mdev->dev, NULL, ring->qp_state, MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp); } static volatile struct mlx4_wqe_data_seg * mlx4_en_store_inline_lso_data(volatile struct mlx4_wqe_data_seg *dseg, struct mbuf *mb, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); /* copy data into place */ m_copydata(mb, 0, len, inl + 4); dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT); return (dseg); } static void mlx4_en_store_inline_lso_header(volatile struct mlx4_wqe_data_seg *dseg, int len, __be32 owner_bit) { } static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 index, u8 owner) { struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; struct mlx4_en_tx_desc *tx_desc = (struct mlx4_en_tx_desc *) (ring->buf + (index * TXBB_SIZE)); volatile __be32 *ptr = (__be32 *)tx_desc; const __be32 stamp = cpu_to_be32(STAMP_VAL | ((u32)owner << STAMP_SHIFT)); u32 i; /* Stamp the freed descriptor */ for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) { *ptr = stamp; ptr += STAMP_DWORDS; } } static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 index) { struct mlx4_en_tx_info *tx_info; struct mbuf *mb; tx_info = &ring->tx_info[index]; 
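/*
 * [Editorial aside, not part of the diff.]  The indexing used all
 * through this file relies on the ring size being a power of two: prod
 * and cons are free-running 32-bit counters, size_mask == size - 1
 * turns a counter into a slot index, and prod - cons is the fill level
 * even across counter wrap.  Minimal sketch:
 */
static inline u32
tx_ring_slot(u32 counter, u32 size_mask)
{
        return (counter & size_mask);   /* valid only for 2^n ring sizes */
}

static inline u32
tx_ring_inflight(u32 prod, u32 cons)
{
        return (prod - cons);           /* unsigned wrap keeps this correct */
}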
mb = tx_info->mb; if (mb == NULL) goto done; bus_dmamap_sync(ring->dma_tag, tx_info->dma_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(ring->dma_tag, tx_info->dma_map); m_freem(mb); done: return (tx_info->nr_txbb); } int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) { struct mlx4_en_priv *priv = netdev_priv(dev); int cnt = 0; /* Skip last polled descriptor */ ring->cons += ring->last_nr_txbb; en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n", ring->cons, ring->prod); if ((u32) (ring->prod - ring->cons) > ring->size) { en_warn(priv, "Tx consumer passed producer!\n"); return 0; } while (ring->cons != ring->prod) { ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring, ring->cons & ring->size_mask); ring->cons += ring->last_nr_txbb; cnt++; } if (cnt) en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt); return cnt; } static bool mlx4_en_tx_ring_is_full(struct mlx4_en_tx_ring *ring) { int wqs; wqs = ring->size - (ring->prod - ring->cons); return (wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS))); } static int mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cq *mcq = &cq->mcq; struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; struct mlx4_cqe *cqe; u16 index; u16 new_index, ring_index, stamp_index; u32 txbbs_skipped = 0; u32 txbbs_stamp = 0; u32 cons_index = mcq->cons_index; int size = cq->size; u32 size_mask = ring->size_mask; struct mlx4_cqe *buf = cq->buf; int factor = priv->cqe_factor; if (!priv->port_up) return 0; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; ring_index = ring->cons & size_mask; stamp_index = ring_index; /* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cons_index & size)) { /* * make sure we read the CQE after we read the * ownership bit */ rmb(); if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR)) { en_err(priv, "CQE completed in error - vendor syndrom: 0x%x syndrom: 0x%x\n", ((struct mlx4_err_cqe *)cqe)-> vendor_err_syndrome, ((struct mlx4_err_cqe *)cqe)->syndrome); } /* Skip over last polled CQE */ new_index = be16_to_cpu(cqe->wqe_index) & size_mask; do { txbbs_skipped += ring->last_nr_txbb; ring_index = (ring_index + ring->last_nr_txbb) & size_mask; /* free next descriptor */ ring->last_nr_txbb = mlx4_en_free_tx_desc( priv, ring, ring_index); mlx4_en_stamp_wqe(priv, ring, stamp_index, !!((ring->cons + txbbs_stamp) & ring->size)); stamp_index = ring_index; txbbs_stamp = txbbs_skipped; } while (ring_index != new_index); ++cons_index; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; } /* * To prevent CQ overflow we first update CQ consumer and only then * the ring consumer. 
*/ mcq->cons_index = cons_index; mlx4_cq_set_ci(mcq); wmb(); ring->cons += txbbs_skipped; /* Wakeup Tx queue if it was stopped and ring is not full */ if (unlikely(ring->blocked) && !mlx4_en_tx_ring_is_full(ring)) { ring->blocked = 0; if (atomic_fetchadd_int(&priv->blocked, -1) == 1) atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE); ring->wake_queue++; priv->port_stats.wake_queue++; } return (0); } void mlx4_en_tx_irq(struct mlx4_cq *mcq) { struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; if (priv->port_up == 0 || !spin_trylock(&ring->comp_lock)) return; mlx4_en_process_tx_cq(cq->dev, cq); mod_timer(&cq->timer, jiffies + 1); spin_unlock(&ring->comp_lock); } void mlx4_en_poll_tx_cq(unsigned long data) { struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data; struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; u32 inflight; INC_PERF_COUNTER(priv->pstats.tx_poll); if (priv->port_up == 0) return; if (!spin_trylock(&ring->comp_lock)) { mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); return; } mlx4_en_process_tx_cq(cq->dev, cq); inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb); /* If there are still packets in flight and the timer has not already * been scheduled by the Tx routine then schedule it here to guarantee * completion processing of these packets */ if (inflight && priv->port_up) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); spin_unlock(&ring->comp_lock); } static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind) { struct mlx4_en_cq *cq = priv->tx_cq[tx_ind]; struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; if (priv->port_up == 0) return; /* If we don't have a pending timer, set one up to catch our recent post in case the interface becomes idle */ if (!timer_pending(&cq->timer)) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0) if (spin_trylock(&ring->comp_lock)) { mlx4_en_process_tx_cq(priv->dev, cq); spin_unlock(&ring->comp_lock); } } static u16 mlx4_en_get_inline_hdr_size(struct mlx4_en_tx_ring *ring, struct mbuf *mb) { u16 retval; /* only copy from first fragment, if possible */ retval = MIN(ring->inline_thold, mb->m_len); /* check for too little data */ if (unlikely(retval < MIN_PKT_LEN)) retval = MIN(ring->inline_thold, mb->m_pkthdr.len); return (retval); } static int mlx4_en_get_header_size(struct mbuf *mb) { struct ether_vlan_header *eh; struct tcphdr *th; struct ip *ip; int ip_hlen, tcp_hlen; struct ip6_hdr *ip6; uint16_t eth_type; int eth_hdr_len; eh = mtod(mb, struct ether_vlan_header *); if (mb->m_len < ETHER_HDR_LEN) return (0); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { eth_type = ntohs(eh->evl_proto); eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); eth_hdr_len = ETHER_HDR_LEN; } if (mb->m_len < eth_hdr_len) return (0); switch (eth_type) { case ETHERTYPE_IP: ip = (struct ip *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip)) return (0); if (ip->ip_p != IPPROTO_TCP) return (0); ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip6)) return (0); if (ip6->ip6_nxt != IPPROTO_TCP) return (0); eth_hdr_len += sizeof(*ip6); break; 
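/*
 * [Editorial aside, not part of the diff.]  mlx4_en_tx_irq() and
 * mlx4_en_poll_tx_cq() above share a trylock-or-defer pattern: when
 * another context already holds comp_lock, completion processing is
 * not waited for but re-armed through the CQ timer, so the work is
 * retried instead of lost.  Skeleton of the pattern
 * (process_tx_completions() and POLL_TIMEOUT are stand-ins):
 */
static void
tx_poll_sketch(struct mlx4_en_cq *cq, struct mlx4_en_tx_ring *ring)
{
        if (!spin_trylock(&ring->comp_lock)) {
                /* busy: retry later rather than spin */
                mod_timer(&cq->timer, jiffies + POLL_TIMEOUT);
                return;
        }
        process_tx_completions(cq);
        spin_unlock(&ring->comp_lock);
}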
default: return (0); } if (mb->m_len < eth_hdr_len + sizeof(*th)) return (0); th = (struct tcphdr *)(mb->m_data + eth_hdr_len); tcp_hlen = th->th_off << 2; eth_hdr_len += tcp_hlen; if (mb->m_len < eth_hdr_len) return (0); return (eth_hdr_len); } static volatile struct mlx4_wqe_data_seg * mlx4_en_store_inline_data(volatile struct mlx4_wqe_data_seg *dseg, struct mbuf *mb, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4; if (unlikely(len < MIN_PKT_LEN)) { m_copydata(mb, 0, len, inl + 4); memset(inl + 4 + len, 0, MIN_PKT_LEN - len); dseg += DIV_ROUND_UP(4 + MIN_PKT_LEN, DS_SIZE_ALIGNMENT); } else if (len <= spc) { m_copydata(mb, 0, len, inl + 4); dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT); } else { m_copydata(mb, 0, spc, inl + 4); m_copydata(mb, spc, len - spc, inl + 8 + spc); dseg += DIV_ROUND_UP(8 + len, DS_SIZE_ALIGNMENT); } return (dseg); } static void mlx4_en_store_inline_header(volatile struct mlx4_wqe_data_seg *dseg, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4; if (unlikely(len < MIN_PKT_LEN)) { *(volatile uint32_t *)inl = SET_BYTE_COUNT((1 << 31) | MIN_PKT_LEN); } else if (len <= spc) { *(volatile uint32_t *)inl = SET_BYTE_COUNT((1 << 31) | len); } else { *(volatile uint32_t *)(inl + 4 + spc) = SET_BYTE_COUNT((1 << 31) | (len - spc)); wmb(); *(volatile uint32_t *)inl = SET_BYTE_COUNT((1 << 31) | spc); } } static uint32_t hashrandom; static void hashrandom_init(void *arg) { /* * It is assumed that the random subsystem has been * initialized when this function is called: */ hashrandom = m_ether_tcpip_hash_init(); } SYSINIT(hashrandom_init, SI_SUB_RANDOM, SI_ORDER_ANY, &hashrandom_init, NULL); u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb) { struct mlx4_en_priv *priv = netdev_priv(dev); u32 rings_p_up = priv->num_tx_rings_p_up; u32 up = 0; u32 queue_index; #if (MLX4_EN_NUM_UP > 1) /* Obtain VLAN information if present */ if (mb->m_flags & M_VLANTAG) { u32 vlan_tag = mb->m_pkthdr.ether_vtag; up = (vlan_tag >> 13) % MLX4_EN_NUM_UP; } #endif queue_index = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 | MBUF_HASHFLAG_L4, mb, hashrandom); return ((queue_index % rings_p_up) + (up * rings_p_up)); } static void mlx4_bf_copy(void __iomem *dst, volatile unsigned long *src, unsigned bytecnt) { __iowrite64_copy(dst, __DEVOLATILE(void *, src), bytecnt / 8); } static u64 mlx4_en_mac_to_u64(u8 *addr) { u64 mac = 0; int i; for (i = 0; i < ETHER_ADDR_LEN; i++) { mac <<= 8; mac |= addr[i]; } return mac; } static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp) { enum { DS_FACT = TXBB_SIZE / DS_SIZE_ALIGNMENT, CTRL_FLAGS = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICITED), }; bus_dma_segment_t segs[MLX4_EN_TX_MAX_MBUF_FRAGS]; volatile struct mlx4_wqe_data_seg *dseg; volatile struct mlx4_wqe_data_seg *dseg_inline; volatile struct mlx4_en_tx_desc *tx_desc; struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; struct ifnet *ifp = priv->dev; struct mlx4_en_tx_info *tx_info; struct mbuf *mb = *mbp; struct mbuf *m; __be32 owner_bit; int nr_segs; int pad; int err; u32 bf_size; u32 bf_prod; u32 opcode; u16 index; u16 ds_cnt; u16 ihs; if (unlikely(!priv->port_up)) { err = EINVAL; goto tx_drop; } /* check if TX ring is full */ if (unlikely(mlx4_en_tx_ring_is_full(ring))) { /* every full native Tx ring stops queue */ if (ring->blocked == 0) atomic_add_int(&priv->blocked, 1); /* Set HW-queue-is-full flag */ 
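/*
 * [Editorial aside, not part of the diff.]  mlx4_en_select_queue()
 * above derives the traffic class from the 802.1p priority: the PCP is
 * the top 3 bits of the 16-bit VLAN TCI, hence the ">> 13" (the driver
 * then folds it modulo MLX4_EN_NUM_UP).  The extraction in isolation:
 */
static inline u32
vlan_tci_to_pcp(u32 tci)
{
        return ((tci >> 13) & 0x7);     /* 802.1p priority, 0..7 */
}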
atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); priv->port_stats.queue_stopped++; ring->blocked = 1; priv->port_stats.queue_stopped++; ring->queue_stopped++; /* Use interrupts to find out when queue opened */ mlx4_en_arm_cq(priv, priv->tx_cq[tx_ind]); return (ENOBUFS); } /* sanity check we are not wrapping around */ KASSERT(((~ring->prod) & ring->size_mask) >= (MLX4_EN_TX_WQE_MAX_WQEBBS - 1), ("Wrapping around TX ring")); /* Track current inflight packets for performance analysis */ AVG_PERF_COUNTER(priv->pstats.inflight_avg, (u32) (ring->prod - ring->cons - 1)); /* Track current mbuf packet header length */ AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len); /* Grab an index and try to transmit packet */ owner_bit = (ring->prod & ring->size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0; index = ring->prod & ring->size_mask; tx_desc = (volatile struct mlx4_en_tx_desc *) (ring->buf + index * TXBB_SIZE); tx_info = &ring->tx_info[index]; dseg = &tx_desc->data; /* send a copy of the frame to the BPF listener, if any */ if (ifp != NULL && ifp->if_bpf != NULL) ETHER_BPF_MTAP(ifp, mb); /* get default flags */ tx_desc->ctrl.srcrb_flags = CTRL_FLAGS; if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM); if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM); /* do statistics */ if (likely(tx_desc->ctrl.srcrb_flags != CTRL_FLAGS)) { priv->port_stats.tx_chksum_offload++; ring->tx_csum++; } /* check for VLAN tag */ if (mb->m_flags & M_VLANTAG) { tx_desc->ctrl.vlan_tag = cpu_to_be16(mb->m_pkthdr.ether_vtag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN; } else { tx_desc->ctrl.vlan_tag = 0; tx_desc->ctrl.ins_vlan = 0; } /* clear immediate field */ tx_desc->ctrl.imm = 0; /* Handle LSO (TSO) packets */ if (mb->m_pkthdr.csum_flags & CSUM_TSO) { u32 payload_len; u32 mss = mb->m_pkthdr.tso_segsz; u32 num_pkts; opcode = cpu_to_be32(MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR) | owner_bit; ihs = mlx4_en_get_header_size(mb); if (unlikely(ihs > MAX_INLINE)) { ring->oversized_packets++; err = EINVAL; goto tx_drop; } tx_desc->lso.mss_hdr_size = cpu_to_be32((mss << 16) | ihs); payload_len = mb->m_pkthdr.len - ihs; if (unlikely(payload_len == 0)) num_pkts = 1; else num_pkts = DIV_ROUND_UP(payload_len, mss); ring->bytes += payload_len + (num_pkts * ihs); ring->packets += num_pkts; priv->port_stats.tso_packets++; /* store pointer to inline header */ dseg_inline = dseg; /* copy data inline */ dseg = mlx4_en_store_inline_lso_data(dseg, mb, ihs, owner_bit); } else { opcode = cpu_to_be32(MLX4_OPCODE_SEND) | owner_bit; ihs = mlx4_en_get_inline_hdr_size(ring, mb); ring->bytes += max_t (unsigned int, mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN); ring->packets++; /* store pointer to inline header */ dseg_inline = dseg; /* copy data inline */ dseg = mlx4_en_store_inline_data(dseg, mb, ihs, owner_bit); } m_adj(mb, ihs); /* trim off empty mbufs */ while (mb->m_len == 0) { mb = m_free(mb); /* check if all data has been inlined */ if (mb == NULL) { nr_segs = 0; goto skip_dma; } } err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map, mb, segs, &nr_segs, BUS_DMA_NOWAIT); if (unlikely(err == EFBIG)) { /* Too many mbuf fragments */ m = m_defrag(mb, M_NOWAIT); if (m == NULL) { ring->oversized_packets++; goto tx_drop; } mb = m; /* Try again */ err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map, mb, segs, &nr_segs, BUS_DMA_NOWAIT); } /* 
catch errors */ if (unlikely(err != 0)) { ring->oversized_packets++; goto tx_drop; } /* make sure all mbuf data is written to RAM */ bus_dmamap_sync(ring->dma_tag, tx_info->dma_map, BUS_DMASYNC_PREWRITE); skip_dma: /* compute number of DS needed */ ds_cnt = (dseg - ((volatile struct mlx4_wqe_data_seg *)tx_desc)) + nr_segs; /* * Check if the next request can wrap around and fill the end * of the current request with zero immediate data: */ pad = DIV_ROUND_UP(ds_cnt, DS_FACT); pad = (~(ring->prod + pad)) & ring->size_mask; if (unlikely(pad < (MLX4_EN_TX_WQE_MAX_WQEBBS - 1))) { /* * Compute the least number of DS blocks we need to * pad in order to achieve a TX ring wraparound: */ pad = (DS_FACT * (pad + 1)); } else { /* * The hardware will automatically jump to the next * TXBB. No need for padding. */ pad = 0; } /* compute total number of DS blocks */ ds_cnt += pad; /* * When modifying this code, please ensure that the following * computation is always less than or equal to 0x3F: * * ((MLX4_EN_TX_WQE_MAX_WQEBBS - 1) * DS_FACT) + * (MLX4_EN_TX_WQE_MAX_WQEBBS * DS_FACT) * * Else the "ds_cnt" variable can become too big. */ tx_desc->ctrl.fence_size = (ds_cnt & 0x3f); /* store pointer to mbuf */ tx_info->mb = mb; tx_info->nr_txbb = DIV_ROUND_UP(ds_cnt, DS_FACT); bf_size = ds_cnt * DS_SIZE_ALIGNMENT; bf_prod = ring->prod; /* compute end of "dseg" array */ dseg += nr_segs + pad; /* pad using zero immediate dseg */ while (pad--) { dseg--; dseg->addr = 0; dseg->lkey = 0; wmb(); dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0); } /* fill segment list */ while (nr_segs--) { if (unlikely(segs[nr_segs].ds_len == 0)) { dseg--; dseg->addr = 0; dseg->lkey = 0; wmb(); dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0); } else { dseg--; dseg->addr = cpu_to_be64((uint64_t)segs[nr_segs].ds_addr); dseg->lkey = cpu_to_be32(priv->mdev->mr.key); wmb(); dseg->byte_count = SET_BYTE_COUNT((uint32_t)segs[nr_segs].ds_len); } } wmb(); /* write owner bits in reverse order */ if ((opcode & cpu_to_be32(0x1F)) == cpu_to_be32(MLX4_OPCODE_LSO)) mlx4_en_store_inline_lso_header(dseg_inline, ihs, owner_bit); else mlx4_en_store_inline_header(dseg_inline, ihs, owner_bit); if (unlikely(priv->validate_loopback)) { /* Copy dst mac address to wqe */ struct ether_header *ethh; u64 mac; u32 mac_l, mac_h; ethh = mtod(mb, struct ether_header *); mac = mlx4_en_mac_to_u64(ethh->ether_dhost); if (mac) { mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16); mac_l = (u32) (mac & 0xffffffff); tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h); tx_desc->ctrl.imm = cpu_to_be32(mac_l); } } /* update producer counter */ ring->prod += tx_info->nr_txbb; if (ring->bf_enabled && bf_size <= MAX_BF && (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_VLAN)) { /* store doorbell number */ *(volatile __be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn); /* or in producer number for this WQE */ opcode |= cpu_to_be32((bf_prod & 0xffff) << 8); /* * Ensure the new descriptor hits memory before * setting ownership of this descriptor to HW: */ wmb(); tx_desc->ctrl.owner_opcode = opcode; wmb(); mlx4_bf_copy(((u8 *)ring->bf.reg) + ring->bf.offset, (volatile unsigned long *) &tx_desc->ctrl, bf_size); wmb(); ring->bf.offset ^= ring->bf.buf_size; } else { /* * Ensure the new descriptor hits memory before * setting ownership of this descriptor to HW: */ wmb(); tx_desc->ctrl.owner_opcode = opcode; wmb(); writel(cpu_to_be32(ring->doorbell_qpn), ((u8 *)ring->bf.uar->map) + MLX4_SEND_DOORBELL); } return (0); tx_drop: *mbp = NULL; m_freem(mb); return (err); } static int 
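/*
 * [Editorial aside, not part of the diff.]  Both doorbell paths just
 * above follow one ordering discipline: descriptor body, wmb(),
 * owner/opcode word, wmb(), then the doorbell (BlueFlame copy or UAR
 * write).  The barriers keep the NIC from seeing an owned descriptor
 * whose body is still sitting in a store buffer.  Skeleton of the
 * sequence (parameter names illustrative):
 */
static void
tx_doorbell_sketch(volatile __be32 *owner_opcode, __be32 opcode,
    void __iomem *db, u32 doorbell_qpn)
{
        /* descriptor body is assumed fully written at this point */
        wmb();                  /* 1: body visible before ownership flips */
        *owner_opcode = opcode; /* hand the WQE to hardware */
        wmb();                  /* 2: ownership visible before doorbell */
        writel(cpu_to_be32(doorbell_qpn), db);  /* ring the send doorbell */
}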
mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mbuf *next; int enqueued, err = 0; ring = priv->tx_ring[tx_ind]; if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || priv->port_up == 0) { if (m != NULL) err = drbr_enqueue(dev, ring->br, m); return (err); } enqueued = 0; if (m != NULL) /* * If we can't insert mbuf into drbr, try to xmit anyway. * We keep the error we got so we could return that after xmit. */ err = drbr_enqueue(dev, ring->br, m); /* Process the queue */ while ((next = drbr_peek(dev, ring->br)) != NULL) { if (mlx4_en_xmit(priv, tx_ind, &next) != 0) { if (next == NULL) { drbr_advance(dev, ring->br); } else { drbr_putback(dev, ring->br, next); } break; } drbr_advance(dev, ring->br); enqueued++; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (enqueued > 0) ring->watchdog_time = ticks; return (err); } void mlx4_en_tx_que(void *context, int pending) { struct mlx4_en_tx_ring *ring; struct mlx4_en_priv *priv; struct net_device *dev; struct mlx4_en_cq *cq; int tx_ind; cq = context; dev = cq->dev; priv = dev->if_softc; tx_ind = cq->ring; ring = priv->tx_ring[tx_ind]; if (priv->port_up != 0 && (dev->if_drv_flags & IFF_DRV_RUNNING) != 0) { mlx4_en_xmit_poll(priv, tx_ind); spin_lock(&ring->tx_lock); if (!drbr_empty(dev, ring->br)) mlx4_en_transmit_locked(dev, tx_ind, NULL); spin_unlock(&ring->tx_lock); } } int mlx4_en_transmit(struct ifnet *dev, struct mbuf *m) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mlx4_en_cq *cq; int i, err = 0; if (priv->port_up == 0) { m_freem(m); return (ENETDOWN); } /* Compute which queue to use */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { i = (m->m_pkthdr.flowid % 128) % priv->tx_ring_num; } else { i = mlx4_en_select_queue(dev, m); } ring = priv->tx_ring[i]; if (spin_trylock(&ring->tx_lock)) { err = mlx4_en_transmit_locked(dev, i, m); spin_unlock(&ring->tx_lock); /* Poll CQ here */ mlx4_en_xmit_poll(priv, i); } else { err = drbr_enqueue(dev, ring->br, m); cq = priv->tx_cq[i]; taskqueue_enqueue(cq->tq, &cq->cq_task); } return (err); } /* * Flush ring buffers. */ void mlx4_en_qflush(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mbuf *m; if (priv->port_up == 0) return; for (int i = 0; i < priv->tx_ring_num; i++) { ring = priv->tx_ring[i]; spin_lock(&ring->tx_lock); while ((m = buf_ring_dequeue_sc(ring->br)) != NULL) m_freem(m); spin_unlock(&ring->tx_lock); } if_qflush(dev); } Index: head/sys/ofed/drivers/net/mlx4/fw.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/fw.c (revision 300675) +++ head/sys/ofed/drivers/net/mlx4/fw.c (revision 300676) @@ -1,1954 +1,1956 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#define LINUXKPI_PARAM_PREFIX mlx4_ + #include #include #include #include #include "fw.h" #include "icm.h" enum { MLX4_COMMAND_INTERFACE_MIN_REV = 2, MLX4_COMMAND_INTERFACE_MAX_REV = 3, MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS = 3, }; extern void __buggy_use_of_MLX4_GET(void); extern void __buggy_use_of_MLX4_PUT(void); -static bool enable_qos; -module_param(enable_qos, bool, 0444); +static u8 enable_qos; +module_param(enable_qos, byte, 0444); MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)"); #define MLX4_GET(dest, source, offset) \ do { \ void *__p = (char *) (source) + (offset); \ switch (sizeof (dest)) { \ case 1: (dest) = *(u8 *) __p; break; \ case 2: (dest) = be16_to_cpup(__p); break; \ case 4: (dest) = be32_to_cpup(__p); break; \ case 8: (dest) = be64_to_cpup(__p); break; \ default: __buggy_use_of_MLX4_GET(); \ } \ } while (0) #define MLX4_PUT(dest, source, offset) \ do { \ void *__d = ((char *) (dest) + (offset)); \ switch (sizeof(source)) { \ case 1: *(u8 *) __d = (source); break; \ case 2: *(__be16 *) __d = cpu_to_be16(source); break; \ case 4: *(__be32 *) __d = cpu_to_be32(source); break; \ case 8: *(__be64 *) __d = cpu_to_be64(source); break; \ default: __buggy_use_of_MLX4_PUT(); \ } \ } while (0) static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) { static const char *fname[] = { [ 0] = "RC transport", [ 1] = "UC transport", [ 2] = "UD transport", [ 3] = "XRC transport", [ 4] = "reliable multicast", [ 5] = "FCoIB support", [ 6] = "SRQ support", [ 7] = "IPoIB checksum offload", [ 8] = "P_Key violation counter", [ 9] = "Q_Key violation counter", [10] = "VMM", [12] = "DPDP", [15] = "Big LSO headers", [16] = "MW support", [17] = "APM support", [18] = "Atomic ops support", [19] = "Raw multicast support", [20] = "Address vector port checking support", [21] = "UD multicast support", [24] = "Demand paging support", [25] = "Router support", [30] = "IBoE support", [32] = "Unicast loopback support", [34] = "FCS header control", [38] = "Wake On LAN support", [40] = "UDP RSS support", [41] = "Unicast VEP steering support", [42] = "Multicast VEP steering support", [44] = "Cross-channel (sync_qp) operations support", [48] = "Counters support", [59] = "Port management change event support", [60] = "eSwitch support", [61] = "64 byte EQE support", [62] = "64 byte CQE 
support", }; int i; mlx4_dbg(dev, "DEV_CAP flags:\n"); for (i = 0; i < ARRAY_SIZE(fname); ++i) if (fname[i] && (flags & (1LL << i))) mlx4_dbg(dev, " %s\n", fname[i]); } static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) { static const char * const fname[] = { [0] = "RSS support", [1] = "RSS Toeplitz Hash Function support", [2] = "RSS XOR Hash Function support", [3] = "Device manage flow steering support", [4] = "FSM (MAC unti-spoofing) support", [5] = "VST (control vlan insertion/stripping) support", [6] = "Dynamic QP updates support", [7] = "Loopback source checks support", [8] = "Device managed flow steering IPoIB support", [9] = "ETS configuration support", [10] = "ETH backplane autoneg report", [11] = "Ethernet Flow control statistics support", [12] = "Recoverable error events support", [13] = "Time stamping support", [14] = "Report driver version to FW support" }; int i; for (i = 0; i < ARRAY_SIZE(fname); ++i) if (fname[i] && (flags & (1LL << i))) mlx4_dbg(dev, " %s\n", fname[i]); } int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) { struct mlx4_cmd_mailbox *mailbox; u32 *inbox; int err = 0; #define MOD_STAT_CFG_IN_SIZE 0x100 #define MOD_STAT_CFG_PG_SZ_M_OFFSET 0x002 #define MOD_STAT_CFG_PG_SZ_OFFSET 0x003 mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; memset(inbox, 0, MOD_STAT_CFG_IN_SIZE); MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET); MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET); err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); u8 field, port; u32 size; int err = 0; #define QUERY_FUNC_CAP_FLAGS_OFFSET 0x0 #define QUERY_FUNC_CAP_NUM_PORTS_OFFSET 0x1 #define QUERY_FUNC_CAP_PF_BHVR_OFFSET 0x4 #define QUERY_FUNC_CAP_FMR_OFFSET 0x8 #define QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP 0x10 #define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP 0x14 #define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP 0x18 #define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP 0x20 #define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP 0x24 #define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP 0x28 #define QUERY_FUNC_CAP_MAX_EQ_OFFSET 0x2c #define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET 0x30 #define QUERY_FUNC_CAP_QP_QUOTA_OFFSET 0x50 #define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET 0x54 #define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET 0x58 #define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET 0x60 #define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET 0x64 #define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET 0x68 #define QUERY_FUNC_CAP_FMR_FLAG 0x80 #define QUERY_FUNC_CAP_FLAG_RDMA 0x40 #define QUERY_FUNC_CAP_FLAG_ETH 0x80 #define QUERY_FUNC_CAP_FLAG_QUOTAS 0x10 /* when opcode modifier = 1 */ #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET 0x3 #define QUERY_FUNC_CAP_FLAGS0_OFFSET 0x8 #define QUERY_FUNC_CAP_FLAGS1_OFFSET 0xc #define QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET 0xd #define QUERY_FUNC_CAP_QP0_TUNNEL 0x10 #define QUERY_FUNC_CAP_QP0_PROXY 0x14 #define QUERY_FUNC_CAP_QP1_TUNNEL 0x18 #define QUERY_FUNC_CAP_QP1_PROXY 0x1c #define QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC 0x40 #define QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN 0x80 #define QUERY_FUNC_CAP_PROPS_DEF_COUNTER 0x20 #define QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID 0x80 if (vhcr->op_modifier == 1) { port = vhcr->in_modifier; /* phys-port = 
logical-port */ MLX4_PUT(outbox->buf, port, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); field = 0; /* ensure that phy_wqe_gid bit is not set */ MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET); /* ensure force vlan and force mac bits are not set * and that default counter bit is set */ field = QUERY_FUNC_CAP_PROPS_DEF_COUNTER; /* def counter */ MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); /* There is always default counter legal or sink counter */ field = mlx4_get_default_counter_index(dev, slave, vhcr->in_modifier); MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET); /* size is now the QP number */ size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL); size += 2; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL); size = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_PROXY); size += 2; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_PROXY); } else if (vhcr->op_modifier == 0) { /* enable rdma and ethernet interfaces, and new quota locations */ field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA | QUERY_FUNC_CAP_FLAG_QUOTAS); MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET); field = dev->caps.num_ports; MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); size = dev->caps.function_caps; /* set PF behaviours */ MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET); field = 0; /* protected FMR support not available as yet */ MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FMR_OFFSET); size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave]; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); size = dev->caps.num_qps; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP); size = priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[slave]; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); size = dev->caps.num_srqs; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP); size = priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[slave]; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); size = dev->caps.num_cqs; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP); size = dev->caps.num_eqs; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET); size = dev->caps.reserved_eqs; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave]; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); size = dev->caps.num_mpts; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP); size = priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[slave]; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); size = dev->caps.num_mtts; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP); size = dev->caps.num_mgms + dev->caps.num_amgms; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP); } else err = -EINVAL; return err; } int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, struct mlx4_func_cap *func_cap) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u8 field, op_modifier; u32 size; int err = 0, quotas = 0; op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(dev, 0, mailbox->dma, gen_or_port, op_modifier, MLX4_CMD_QUERY_FUNC_CAP, 
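/*
 * MLX4_CMD_WRAPPED: when issued on a slave, this command is trapped
 * and serviced by the PF through mlx4_QUERY_FUNC_CAP_wrapper() above,
 * which reports per-function quotas rather than raw device limits.
 */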
MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) goto out; outbox = mailbox->buf; if (!op_modifier) { MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS_OFFSET); if (!(field & (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA))) { mlx4_err(dev, "The host supports neither eth nor rdma interfaces\n"); err = -EPROTONOSUPPORT; goto out; } func_cap->flags = field; quotas = !!(func_cap->flags & QUERY_FUNC_CAP_FLAG_QUOTAS); MLX4_GET(field, outbox, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); func_cap->num_ports = field; MLX4_GET(size, outbox, QUERY_FUNC_CAP_PF_BHVR_OFFSET); func_cap->pf_context_behaviour = size; if (quotas) { MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); func_cap->qp_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); func_cap->srq_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); func_cap->cq_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); func_cap->mpt_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); func_cap->mtt_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); func_cap->mcg_quota = size & 0xFFFFFF; } else { MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP); func_cap->qp_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP); func_cap->srq_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP); func_cap->cq_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP); func_cap->mpt_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP); func_cap->mtt_quota = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP); func_cap->mcg_quota = size & 0xFFFFFF; } MLX4_GET(size, outbox, QUERY_FUNC_CAP_MAX_EQ_OFFSET); func_cap->max_eq = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); func_cap->reserved_eq = size & 0xFFFFFF; goto out; } /* logical port query */ if (gen_or_port > dev->caps.num_ports) { err = -EINVAL; goto out; } if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) { MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN) { mlx4_err(dev, "VLAN is enforced on this port\n"); err = -EPROTONOSUPPORT; goto out; } if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC) { mlx4_err(dev, "Force mac is enabled on this port\n"); err = -EPROTONOSUPPORT; goto out; } } else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) { MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); if (field & QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID) { mlx4_err(dev, "phy_wqe_gid is " "enforced on this ib port\n"); err = -EPROTONOSUPPORT; goto out; } } MLX4_GET(field, outbox, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); func_cap->physical_port = field; if (func_cap->physical_port != gen_or_port) { err = -ENOSYS; goto out; } MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); if (field & QUERY_FUNC_CAP_PROPS_DEF_COUNTER) { MLX4_GET(field, outbox, QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET); func_cap->def_counter_index = field; } else { func_cap->def_counter_index = MLX4_SINK_COUNTER_INDEX; } MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); func_cap->qp0_tunnel_qpn = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY); func_cap->qp0_proxy_qpn = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL); func_cap->qp1_tunnel_qpn = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY); func_cap->qp1_proxy_qpn 
= size & 0xFFFFFF; /* All other resources are allocated by the master, but we still report * 'num' and 'reserved' capabilities as follows: * - num remains the maximum resource index * - 'num - reserved' is the total available objects of a resource, but * resource indices may be less than 'reserved' * TODO: set per-resource quotas */ out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u8 field; u32 field32, flags, ext_flags; u16 size; u16 stat_rate; int err; int i; #define QUERY_DEV_CAP_OUT_SIZE 0x100 #define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET 0x10 #define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET 0x11 #define QUERY_DEV_CAP_RSVD_QP_OFFSET 0x12 #define QUERY_DEV_CAP_MAX_QP_OFFSET 0x13 #define QUERY_DEV_CAP_RSVD_SRQ_OFFSET 0x14 #define QUERY_DEV_CAP_MAX_SRQ_OFFSET 0x15 #define QUERY_DEV_CAP_RSVD_EEC_OFFSET 0x16 #define QUERY_DEV_CAP_MAX_EEC_OFFSET 0x17 #define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET 0x19 #define QUERY_DEV_CAP_RSVD_CQ_OFFSET 0x1a #define QUERY_DEV_CAP_MAX_CQ_OFFSET 0x1b #define QUERY_DEV_CAP_MAX_MPT_OFFSET 0x1d #define QUERY_DEV_CAP_RSVD_EQ_OFFSET 0x1e #define QUERY_DEV_CAP_MAX_EQ_OFFSET 0x1f #define QUERY_DEV_CAP_RSVD_MTT_OFFSET 0x20 #define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET 0x21 #define QUERY_DEV_CAP_RSVD_MRW_OFFSET 0x22 #define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET 0x23 #define QUERY_DEV_CAP_MAX_AV_OFFSET 0x27 #define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET 0x29 #define QUERY_DEV_CAP_MAX_RES_QP_OFFSET 0x2b #define QUERY_DEV_CAP_MAX_GSO_OFFSET 0x2d #define QUERY_DEV_CAP_RSS_OFFSET 0x2e #define QUERY_DEV_CAP_MAX_RDMA_OFFSET 0x2f #define QUERY_DEV_CAP_RSZ_SRQ_OFFSET 0x33 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET 0x35 #define QUERY_DEV_CAP_MTU_WIDTH_OFFSET 0x36 #define QUERY_DEV_CAP_VL_PORT_OFFSET 0x37 #define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET 0x38 #define QUERY_DEV_CAP_MAX_GID_OFFSET 0x3b #define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET 0x3c #define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET 0x3e #define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 #define QUERY_DEV_CAP_SYNC_QP_OFFSET 0x42 #define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 #define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 #define QUERY_DEV_CAP_PAGE_SZ_OFFSET 0x4b #define QUERY_DEV_CAP_BF_OFFSET 0x4c #define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET 0x4d #define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET 0x4e #define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET 0x4f #define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET 0x51 #define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52 #define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55 #define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56 #define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61 #define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62 #define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 #define QUERY_DEV_CAP_RSVD_PD_OFFSET 0x64 #define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65 #define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66 #define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67 #define QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET 0x68 #define QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET 0x6c #define QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET 0x70 #define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 #define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET 0x70 #define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET 0x74 #define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 #define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET 
0x86 #define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET 0x88 #define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET 0x8a #define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET 0x8c #define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET 0x8e #define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET 0x90 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET 0x92 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_CAP_ETS_CFG_OFFSET 0x9c #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 dev_cap->flags2 = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) goto out; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET); dev_cap->reserved_qps = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET); dev_cap->max_qps = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET); dev_cap->reserved_srqs = 1 << (field >> 4); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET); dev_cap->max_srqs = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET); dev_cap->max_cq_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET); dev_cap->reserved_cqs = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET); dev_cap->max_cqs = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET); dev_cap->max_mpts = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET); dev_cap->reserved_eqs = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET); dev_cap->max_eqs = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET); dev_cap->reserved_mtts = 1 << (field >> 4); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET); dev_cap->max_mrw_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET); dev_cap->reserved_mrws = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET); dev_cap->max_mtt_seg = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET); dev_cap->max_requester_per_qp = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET); dev_cap->max_responder_per_qp = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET); field &= 0x1f; if (!field) dev_cap->max_gso_sz = 0; else dev_cap->max_gso_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSS_OFFSET); if (field & 0x20) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_XOR; if (field & 0x10) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_TOP; field &= 0xf; if (field) { dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS; dev_cap->max_rss_tbl_sz = 1 << field; } else dev_cap->max_rss_tbl_sz = 0; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET); dev_cap->max_rdma_global = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET); dev_cap->local_ca_ack_delay = field & 0x1f; MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); dev_cap->num_ports = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET); dev_cap->max_msg_sz = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET); if (field & 0x10) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN; MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); if (field & 0x80) dev_cap->flags2 |= 
MLX4_DEV_CAP_FLAG2_DMFS_IPOIB; dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET); dev_cap->fs_max_num_qp_per_entry = field; MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET); dev_cap->stat_rate_support = stat_rate; MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_TS; MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); dev_cap->flags = flags | (u64)ext_flags << 32; MLX4_GET(field, outbox, QUERY_DEV_CAP_SYNC_QP_OFFSET); dev_cap->sync_qp = field & 0x10; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); dev_cap->reserved_uars = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); dev_cap->uar_size = 1 << ((field & 0x3f) + 20); MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET); dev_cap->min_page_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET); if (field & 0x80) { MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET); dev_cap->bf_reg_size = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET); if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) field = 3; dev_cap->bf_regs_per_page = 1 << (field & 0x3f); mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); } else { dev_cap->bf_reg_size = 0; mlx4_dbg(dev, "BlueFlame not available\n"); } MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET); dev_cap->max_sq_sg = field; MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET); dev_cap->max_sq_desc_sz = size; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET); dev_cap->max_qp_per_mcg = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET); dev_cap->reserved_mgms = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET); dev_cap->max_mcgs = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET); dev_cap->reserved_pds = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET); dev_cap->max_pds = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET); dev_cap->reserved_xrcds = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET); dev_cap->max_xrcds = 1 << (field & 0x1f); MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET); dev_cap->rdmarc_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET); dev_cap->qpc_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET); dev_cap->aux_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET); dev_cap->altc_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET); dev_cap->eqc_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET); dev_cap->cqc_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET); dev_cap->srq_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET); dev_cap->cmpt_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET); dev_cap->mtt_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET); dev_cap->dmpt_entry_sz = size; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET); dev_cap->max_srq_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET); dev_cap->max_qp_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET); dev_cap->resize_srq = field & 1; MLX4_GET(field, outbox, 
QUERY_DEV_CAP_MAX_SG_RQ_OFFSET); dev_cap->max_rq_sg = field; MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET); dev_cap->max_rq_desc_sz = size; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); MLX4_GET(dev_cap->reserved_lkey, outbox, QUERY_DEV_CAP_RSVD_LKEY_OFFSET); MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETS_CFG_OFFSET); if (field32 & (1 << 0)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP; if (field32 & (1 << 7)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT; if (field32 & (1 << 8)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW; if (field32 & (1 << 13)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG; MLX4_GET(dev_cap->max_icm_sz, outbox, QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET); if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS) MLX4_GET(dev_cap->max_basic_counters, outbox, QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET); /* FW reports 256 however real value is 255 */ dev_cap->max_basic_counters = min_t(u32, dev_cap->max_basic_counters, 255); if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT) MLX4_GET(dev_cap->max_extended_counters, outbox, QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET); MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); if (field32 & (1 << 16)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; if (field32 & (1 << 19)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_LB_SRC_CHK; if (field32 & (1 << 20)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM; if (field32 & (1 << 26)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL; if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { for (i = 1; i <= dev_cap->num_ports; ++i) { MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); dev_cap->max_vl[i] = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET); dev_cap->ib_mtu[i] = field >> 4; dev_cap->max_port_width[i] = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET); dev_cap->max_gids[i] = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET); dev_cap->max_pkeys[i] = 1 << (field & 0xf); } } else { #define QUERY_PORT_SUPPORTED_TYPE_OFFSET 0x00 #define QUERY_PORT_MTU_OFFSET 0x01 #define QUERY_PORT_ETH_MTU_OFFSET 0x02 #define QUERY_PORT_WIDTH_OFFSET 0x06 #define QUERY_PORT_MAX_GID_PKEY_OFFSET 0x07 #define QUERY_PORT_MAX_MACVLAN_OFFSET 0x0a #define QUERY_PORT_MAX_VL_OFFSET 0x0b #define QUERY_PORT_MAC_OFFSET 0x10 #define QUERY_PORT_TRANS_VENDOR_OFFSET 0x18 #define QUERY_PORT_WAVELENGTH_OFFSET 0x1c #define QUERY_PORT_TRANS_CODE_OFFSET 0x20 for (i = 1; i <= dev_cap->num_ports; ++i) { err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET); dev_cap->supported_port_types[i] = field & 3; dev_cap->suggested_type[i] = (field >> 3) & 1; dev_cap->default_sense[i] = (field >> 4) & 1; MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET); dev_cap->ib_mtu[i] = field & 0xf; MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET); dev_cap->max_port_width[i] = field & 0xf; MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET); dev_cap->max_gids[i] = 1 << (field >> 4); dev_cap->max_pkeys[i] = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET); dev_cap->max_vl[i] = field & 0xf; MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET); dev_cap->log_max_macs[i] = field & 0xf; dev_cap->log_max_vlans[i] = field >> 4; MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET); MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET); MLX4_GET(field32, outbox, 
QUERY_PORT_TRANS_VENDOR_OFFSET); dev_cap->trans_type[i] = field32 >> 24; dev_cap->vendor_oui[i] = field32 & 0xffffff; MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET); MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET); } } mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n", dev_cap->bmme_flags, dev_cap->reserved_lkey); /* * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then * we can't use any EQs whose doorbell falls on that page, * even if the EQ itself isn't reserved. */ dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4, dev_cap->reserved_eqs); mlx4_dbg(dev, "Max ICM size %lld MB\n", (unsigned long long) dev_cap->max_icm_sz >> 20); mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz); mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz); mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz); mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n", dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz); mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", dev_cap->reserved_mrws, dev_cap->reserved_mtts); mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars); mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", dev_cap->max_qp_per_mcg, dev_cap->reserved_mgms); mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz); mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n", dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1], dev_cap->max_port_width[1]); mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n", dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg); mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n", dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg); mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz); mlx4_dbg(dev, "Max basic counters: %d\n", dev_cap->max_basic_counters); mlx4_dbg(dev, "Max extended counters: %d\n", dev_cap->max_extended_counters); mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz); dump_dev_cap_flags(dev, dev_cap->flags); dump_dev_cap_flags2(dev, dev_cap->flags2); out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { u64 flags; int err = 0; u8 field; err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; /* add port mng change event capability unconditionally to slaves */ MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV; MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); /* For guests, report Blueflame disabled */ MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET); field &= 0x7f; MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); /* turn off device-managed steering capability if not enabled */ if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) { MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); field &= 0x7f; MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); } return 0; } int
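/*
 * Paravirtualized QUERY_PORT: the PF runs the real command, then
 * rewrites the fields a VF may see (MAC, port type, link state, and
 * the GID/PKEY table sizes) before the result reaches the slave.
 */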
mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); u64 def_mac; u8 port_type; u16 short_field; int err; int admin_link_state; #define MLX4_VF_PORT_NO_LINK_SENSE_MASK 0xE0 #define MLX4_PORT_LINK_UP_MASK 0x80 #define QUERY_PORT_CUR_MAX_PKEY_OFFSET 0x0c #define QUERY_PORT_CUR_MAX_GID_OFFSET 0x0e err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (!err && dev->caps.function != slave) { /* set slave default_mac address to be zero MAC */ def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac; MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET); /* get port type - currently only eth is enabled */ MLX4_GET(port_type, outbox->buf, QUERY_PORT_SUPPORTED_TYPE_OFFSET); /* No link sensing allowed */ port_type &= MLX4_VF_PORT_NO_LINK_SENSE_MASK; /* set port type to currently operating port type */ port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3); admin_link_state = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.link_state; if (IFLA_VF_LINK_STATE_ENABLE == admin_link_state) port_type |= MLX4_PORT_LINK_UP_MASK; else if (IFLA_VF_LINK_STATE_DISABLE == admin_link_state) port_type &= ~MLX4_PORT_LINK_UP_MASK; MLX4_PUT(outbox->buf, port_type, QUERY_PORT_SUPPORTED_TYPE_OFFSET); if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH) short_field = mlx4_get_slave_num_gids(dev, slave); else short_field = 1; /* slave max gids */ MLX4_PUT(outbox->buf, short_field, QUERY_PORT_CUR_MAX_GID_OFFSET); short_field = dev->caps.pkey_table_len[vhcr->in_modifier]; MLX4_PUT(outbox->buf, short_field, QUERY_PORT_CUR_MAX_PKEY_OFFSET); } return err; } int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port, int *gid_tbl_len, int *pkey_tbl_len) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u16 field; int err; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; outbox = mailbox->buf; MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_GID_OFFSET); *gid_tbl_len = field; MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_PKEY_OFFSET); *pkey_tbl_len = field; out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } EXPORT_SYMBOL(mlx4_get_slave_pkey_gid_tbl_len); int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) { struct mlx4_cmd_mailbox *mailbox; struct mlx4_icm_iter iter; __be64 *pages; int lg; int nent = 0; int i; int err = 0; int ts = 0, tc = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE); pages = mailbox->buf; for (mlx4_icm_first(icm, &iter); !mlx4_icm_last(&iter); mlx4_icm_next(&iter)) { /* * We have to pass pages that are aligned to their * size, so find the least significant 1 in the * address or size and use that as our log2 size. 
*/ lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1; if (lg < MLX4_ICM_PAGE_SHIFT) { mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n", MLX4_ICM_PAGE_SIZE, (unsigned long long) mlx4_icm_addr(&iter), mlx4_icm_size(&iter)); err = -EINVAL; goto out; } for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) { if (virt != -1) { pages[nent * 2] = cpu_to_be64(virt); virt += 1 << lg; } pages[nent * 2 + 1] = cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) | (lg - MLX4_ICM_PAGE_SHIFT)); ts += 1 << (lg - 10); ++tc; if (++nent == MLX4_MAILBOX_SIZE / 16) { err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; nent = 0; } } } if (nent) err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; switch (op) { case MLX4_CMD_MAP_FA: mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts); break; case MLX4_CMD_MAP_ICM_AUX: mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts); break; case MLX4_CMD_MAP_ICM: mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n", tc, ts, (unsigned long long) virt - (ts << 10)); break; } out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm) { return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1); } int mlx4_UNMAP_FA(struct mlx4_dev *dev) { return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } int mlx4_RUN_FW(struct mlx4_dev *dev) { return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } int mlx4_QUERY_FW(struct mlx4_dev *dev) { struct mlx4_fw *fw = &mlx4_priv(dev)->fw; struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; struct mlx4_cmd_mailbox *mailbox; u32 *outbox; int err = 0; u64 fw_ver; u16 cmd_if_rev; u8 lg; #define QUERY_FW_OUT_SIZE 0x100 #define QUERY_FW_VER_OFFSET 0x00 #define QUERY_FW_PPF_ID 0x09 #define QUERY_FW_CMD_IF_REV_OFFSET 0x0a #define QUERY_FW_MAX_CMD_OFFSET 0x0f #define QUERY_FW_ERR_START_OFFSET 0x30 #define QUERY_FW_ERR_SIZE_OFFSET 0x38 #define QUERY_FW_ERR_BAR_OFFSET 0x3c #define QUERY_FW_SIZE_OFFSET 0x00 #define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 #define QUERY_FW_CLR_INT_BAR_OFFSET 0x28 #define QUERY_FW_COMM_BASE_OFFSET 0x40 #define QUERY_FW_COMM_BAR_OFFSET 0x48 #define QUERY_FW_CLOCK_OFFSET 0x50 #define QUERY_FW_CLOCK_BAR 0x58 mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) goto out; MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET); /* * FW subminor version is at more significant bits than minor * version, so swap here. 
*/ dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) | ((fw_ver & 0xffff0000ull) >> 16) | ((fw_ver & 0x0000ffffull) << 16); MLX4_GET(lg, outbox, QUERY_FW_PPF_ID); dev->caps.function = lg; if (mlx4_is_slave(dev)) goto out; MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET); if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV || cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) { mlx4_err(dev, "Installed FW has unsupported " "command interface revision %d.\n", cmd_if_rev); mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n", (int) (dev->caps.fw_ver >> 32), (int) (dev->caps.fw_ver >> 16) & 0xffff, (int) dev->caps.fw_ver & 0xffff); mlx4_err(dev, "This driver version supports only revisions %d to %d.\n", MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV); err = -ENODEV; goto out; } if (cmd_if_rev < MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS) dev->flags |= MLX4_FLAG_OLD_PORT_CMDS; MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); cmd->max_cmds = 1 << lg; mlx4_dbg(dev, "FW version %d.%d.%03d (cmd intf rev %d), max commands %d\n", (int) (dev->caps.fw_ver >> 32), (int) (dev->caps.fw_ver >> 16) & 0xffff, (int) dev->caps.fw_ver & 0xffff, cmd_if_rev, cmd->max_cmds); MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET); MLX4_GET(fw->catas_size, outbox, QUERY_FW_ERR_SIZE_OFFSET); MLX4_GET(fw->catas_bar, outbox, QUERY_FW_ERR_BAR_OFFSET); fw->catas_bar = (fw->catas_bar >> 6) * 2; mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n", (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar); MLX4_GET(fw->fw_pages, outbox, QUERY_FW_SIZE_OFFSET); MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET); MLX4_GET(fw->clr_int_bar, outbox, QUERY_FW_CLR_INT_BAR_OFFSET); fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2; MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET); MLX4_GET(fw->comm_bar, outbox, QUERY_FW_COMM_BAR_OFFSET); fw->comm_bar = (fw->comm_bar >> 6) * 2; mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n", fw->comm_bar, (unsigned long long)fw->comm_base); mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2); MLX4_GET(fw->clock_offset, outbox, QUERY_FW_CLOCK_OFFSET); MLX4_GET(fw->clock_bar, outbox, QUERY_FW_CLOCK_BAR); fw->clock_bar = (fw->clock_bar >> 6) * 2; mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n", fw->clock_bar, (unsigned long long)fw->clock_offset); /* * Round up number of system pages needed in case * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
*/ fw->fw_pages = ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >> (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT); mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n", (unsigned long long) fw->clr_int_base, fw->clr_int_bar); out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { u8 *outbuf; int err; outbuf = outbox->buf; err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; /* for slaves, set pci PPF ID to invalid and zero out everything * else except FW version */ outbuf[0] = outbuf[1] = 0; memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8); outbuf[QUERY_FW_PPF_ID] = MLX4_INVALID_SLAVE_ID; return 0; } static void get_board_id(void *vsd, char *board_id, char *vsdstr) { int i; #define VSD_OFFSET_SIG1 0x00 #define VSD_OFFSET_SIG2 0xde #define VSD_OFFSET_MLX_BOARD_ID 0xd0 #define VSD_OFFSET_TS_BOARD_ID 0x20 #define VSD_LEN 0xd0 #define VSD_SIGNATURE_TOPSPIN 0x5ad memset(vsdstr, 0, MLX4_VSD_LEN); for (i = 0; i < VSD_LEN / 4; i++) ((u32 *)vsdstr)[i] = swab32(*(u32 *)(vsd + i * 4)); memset(board_id, 0, MLX4_BOARD_ID_LEN); if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) { strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN); } else { /* * The board ID is a string but the firmware byte * swaps each 4-byte word before passing it back to * us. Therefore we need to swab it before printing. */ for (i = 0; i < 4; ++i) ((u32 *) board_id)[i] = swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4)); } } int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; int err; #define QUERY_ADAPTER_OUT_SIZE 0x100 #define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 #define QUERY_ADAPTER_VSD_OFFSET 0x20 #define QUERY_ADAPTER_VSD_VENDOR_ID_OFFSET 0x1e mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) goto out; MLX4_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); adapter->vsd_vendor_id = be16_to_cpup((u16 *)outbox + QUERY_ADAPTER_VSD_VENDOR_ID_OFFSET / 2); get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, adapter->board_id, adapter->vsd); out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) { struct mlx4_cmd_mailbox *mailbox; __be32 *inbox; u32 mw_enable; int err; #define INIT_HCA_IN_SIZE 0x200 #define INIT_HCA_DRV_NAME_FOR_FW_MAX_SIZE 64 #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION 2 #define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 0x018 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) #define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) #define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28) #define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) #define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) #define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) #define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38) #define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) #define 
INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) #define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) #define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67) #define INIT_HCA_RDMARC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70) #define INIT_HCA_LOG_RD_OFFSET (INIT_HCA_QPC_OFFSET + 0x77) #define INIT_HCA_MCAST_OFFSET 0x0c0 #define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) #define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) #define INIT_HCA_LOG_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) #define INIT_HCA_UC_STEERING_OFFSET (INIT_HCA_MCAST_OFFSET + 0x18) #define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) #define INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN 0x6 #define INIT_HCA_DRIVER_VERSION_OFFSET 0x140 #define INIT_HCA_FS_PARAM_OFFSET 0x1d0 #define INIT_HCA_FS_BASE_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x00) #define INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x12) #define INIT_HCA_FS_LOG_TABLE_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x1b) #define INIT_HCA_FS_ETH_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x21) #define INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22) #define INIT_HCA_FS_IB_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x25) #define INIT_HCA_FS_IB_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x26) #define INIT_HCA_TPT_OFFSET 0x0f0 #define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) #define INIT_HCA_TPT_MW_OFFSET (INIT_HCA_TPT_OFFSET + 0x08) #define INIT_HCA_TPT_MW_ENABLE (1 << 31) #define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) #define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) #define INIT_HCA_CMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x18) #define INIT_HCA_UAR_OFFSET 0x120 #define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) #define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; memset(inbox, 0, INIT_HCA_IN_SIZE); *((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = ((ilog2(cache_line_size()) - 4) << 5) | (1 << 4); #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1); #elif defined(__BIG_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1); #else #error Host endianness not defined #endif /* Check port for UD address vector: */ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1); /* Enable IPoIB checksumming if we can: */ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3); /* Enable QoS support if module parameter set */ if (enable_qos) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2); /* Enable fast drop performance optimization */ if (dev->caps.fast_drop) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 7); /* enable counters */ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4); /* CX3 is capable of extending CQEs\EQEs from 32 to 64 bytes */ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) { *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29); dev->caps.eqe_size = 64; dev->caps.eqe_factor = 1; } else { dev->caps.eqe_size = 32; dev->caps.eqe_factor = 0; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); dev->caps.cqe_size = 64; dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; } 
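/* FW does not support 64-byte CQEs; stay with the 32-byte default. */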
else { dev->caps.cqe_size = 32; } if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) *(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET / 4) |= cpu_to_be32(1 << 31); if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW) { strncpy((u8 *)mailbox->buf + INIT_HCA_DRIVER_VERSION_OFFSET, DRV_NAME_FOR_FW, INIT_HCA_DRV_NAME_FOR_FW_MAX_SIZE - 1); mlx4_dbg(dev, "Reporting Driver Version to FW: %s\n", (u8 *)mailbox->buf + INIT_HCA_DRIVER_VERSION_OFFSET); } /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); MLX4_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET); MLX4_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET); MLX4_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET); MLX4_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET); MLX4_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET); MLX4_PUT(inbox, param->altc_base, INIT_HCA_ALTC_BASE_OFFSET); MLX4_PUT(inbox, param->auxc_base, INIT_HCA_AUXC_BASE_OFFSET); MLX4_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET); MLX4_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET); MLX4_PUT(inbox, param->rdmarc_base, INIT_HCA_RDMARC_BASE_OFFSET); MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET); /* steering attributes */ if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN); MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET); MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); /* Enable Ethernet flow steering * with udp unicast and tcp unicast */ MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), INIT_HCA_FS_ETH_BITS_OFFSET); MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET); /* Enable IPoIB flow steering * with udp unicast and tcp unicast */ MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), INIT_HCA_FS_IB_BITS_OFFSET); MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, INIT_HCA_FS_IB_NUM_ADDRS_OFFSET); } else { MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); MLX4_PUT(inbox, param->log_mc_hash_sz, INIT_HCA_LOG_MC_HASH_SZ_OFFSET); MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0) MLX4_PUT(inbox, (u8) (1 << 3), INIT_HCA_UC_STEERING_OFFSET); } /* TPT attributes */ MLX4_PUT(inbox, param->dmpt_base, INIT_HCA_DMPT_BASE_OFFSET); mw_enable = param->mw_enable ? 
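/* bit 31 of the TPT MW dword enables memory windows */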
INIT_HCA_TPT_MW_ENABLE : 0; MLX4_PUT(inbox, mw_enable, INIT_HCA_TPT_MW_OFFSET); MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); MLX4_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); MLX4_PUT(inbox, param->cmpt_base, INIT_HCA_CMPT_BASE_OFFSET); /* UAR attributes */ MLX4_PUT(inbox, param->uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000, MLX4_CMD_NATIVE); if (err) mlx4_err(dev, "INIT_HCA returns %d\n", err); mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) { struct mlx4_cmd_mailbox *mailbox; __be32 *outbox; u32 dword_field; u32 mw_enable; int err; u8 byte_field; #define QUERY_HCA_GLOBAL_CAPS_OFFSET 0x04 #define QUERY_HCA_CORE_CLOCK_OFFSET 0x0c mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_HCA, MLX4_CMD_TIME_CLASS_B, !mlx4_is_slave(dev)); if (err) goto out; MLX4_GET(param->global_caps, outbox, QUERY_HCA_GLOBAL_CAPS_OFFSET); MLX4_GET(param->hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET); /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_GET(param->qpc_base, outbox, INIT_HCA_QPC_BASE_OFFSET); MLX4_GET(param->log_num_qps, outbox, INIT_HCA_LOG_QP_OFFSET); MLX4_GET(param->srqc_base, outbox, INIT_HCA_SRQC_BASE_OFFSET); MLX4_GET(param->log_num_srqs, outbox, INIT_HCA_LOG_SRQ_OFFSET); MLX4_GET(param->cqc_base, outbox, INIT_HCA_CQC_BASE_OFFSET); MLX4_GET(param->log_num_cqs, outbox, INIT_HCA_LOG_CQ_OFFSET); MLX4_GET(param->altc_base, outbox, INIT_HCA_ALTC_BASE_OFFSET); MLX4_GET(param->auxc_base, outbox, INIT_HCA_AUXC_BASE_OFFSET); MLX4_GET(param->eqc_base, outbox, INIT_HCA_EQC_BASE_OFFSET); MLX4_GET(param->log_num_eqs, outbox, INIT_HCA_LOG_EQ_OFFSET); MLX4_GET(param->rdmarc_base, outbox, INIT_HCA_RDMARC_BASE_OFFSET); MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET); MLX4_GET(dword_field, outbox, INIT_HCA_FLAGS_OFFSET); if (dword_field & (1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN)) { param->steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; } else { MLX4_GET(byte_field, outbox, INIT_HCA_UC_STEERING_OFFSET); if (byte_field & 0x8) param->steering_mode = MLX4_STEERING_MODE_B0; else param->steering_mode = MLX4_STEERING_MODE_A0; } /* steering attributes */ if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET); MLX4_GET(param->log_mc_entry_sz, outbox, INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); MLX4_GET(param->log_mc_table_sz, outbox, INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); } else { MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET); MLX4_GET(param->log_mc_entry_sz, outbox, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); MLX4_GET(param->log_mc_hash_sz, outbox, INIT_HCA_LOG_MC_HASH_SZ_OFFSET); MLX4_GET(param->log_mc_table_sz, outbox, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); } /* CX3 is capable of extending CQEs\EQEs from 32 to 64 bytes */ MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS); if (byte_field & 0x20) /* 64-bytes eqe enabled */ param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; if (byte_field & 0x40) /* 64-bytes cqe enabled */ param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; /* TPT attributes */ MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); MLX4_GET(mw_enable, outbox, INIT_HCA_TPT_MW_OFFSET); param->mw_enable = (mw_enable & INIT_HCA_TPT_MW_ENABLE) == 
INIT_HCA_TPT_MW_ENABLE; MLX4_GET(param->log_mpt_sz, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET); MLX4_GET(param->mtt_base, outbox, INIT_HCA_MTT_BASE_OFFSET); MLX4_GET(param->cmpt_base, outbox, INIT_HCA_CMPT_BASE_OFFSET); /* UAR attributes */ MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET); out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } /* for IB-type ports only in SRIOV mode. Checks that both proxy QP0 * and real QP0 are active, so that the paravirtualized QP0 is ready * to operate */ static int check_qp0_state(struct mlx4_dev *dev, int function, int port) { struct mlx4_priv *priv = mlx4_priv(dev); /* irrelevant if not infiniband */ if (priv->mfunc.master.qp0_state[port].proxy_qp0_active && priv->mfunc.master.qp0_state[port].qp0_active) return 1; return 0; } int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); int port = vhcr->in_modifier; int err; if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port)) return 0; if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { /* Enable port only if it was previously disabled */ if (!priv->mfunc.master.init_port_ref[port]) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; } priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); } else { if (slave == mlx4_master_func_num(dev)) { if (check_qp0_state(dev, slave, port) && !priv->mfunc.master.qp0_state[port].port_active) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; priv->mfunc.master.qp0_state[port].port_active = 1; priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); } } else priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); } ++priv->mfunc.master.init_port_ref[port]; return 0; } int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) { struct mlx4_cmd_mailbox *mailbox; u32 *inbox; int err; u32 flags; u16 field; if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { #define INIT_PORT_IN_SIZE 256 #define INIT_PORT_FLAGS_OFFSET 0x00 #define INIT_PORT_FLAG_SIG (1 << 18) #define INIT_PORT_FLAG_NG (1 << 17) #define INIT_PORT_FLAG_G0 (1 << 16) #define INIT_PORT_VL_SHIFT 4 #define INIT_PORT_PORT_WIDTH_SHIFT 8 #define INIT_PORT_MTU_OFFSET 0x04 #define INIT_PORT_MAX_GID_OFFSET 0x06 #define INIT_PORT_MAX_PKEY_OFFSET 0x0a #define INIT_PORT_GUID0_OFFSET 0x10 #define INIT_PORT_NODE_GUID_OFFSET 0x18 #define INIT_PORT_SI_GUID_OFFSET 0x20 mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); inbox = mailbox->buf; memset(inbox, 0, INIT_PORT_IN_SIZE); flags = 0; flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT; flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT; MLX4_PUT(inbox, flags, INIT_PORT_FLAGS_OFFSET); field = 128 << dev->caps.ib_mtu_cap[port]; MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET); field = dev->caps.gid_table_len[port]; MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET); field = dev->caps.pkey_table_len[port]; MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET); err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); } else err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); return err; } 
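/*
 * Exported for the drivers stacked on mlx4_core (presumably mlx4_en
 * and mlx4_ib) to bring ports up; the wrapped variant above keeps the
 * per-slave init_port reference counts consistent.
 */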
EXPORT_SYMBOL_GPL(mlx4_INIT_PORT); int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); int port = vhcr->in_modifier; int err; if (!(priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port))) return 0; if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { if (priv->mfunc.master.init_port_ref[port] == 1) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, MLX4_CMD_NATIVE); if (err) return err; } priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); } else { /* infiniband port */ if (slave == mlx4_master_func_num(dev)) { if (!priv->mfunc.master.qp0_state[port].qp0_active && priv->mfunc.master.qp0_state[port].port_active) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, MLX4_CMD_NATIVE); if (err) return err; priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); priv->mfunc.master.qp0_state[port].port_active = 0; } } else priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); } --priv->mfunc.master.init_port_ref[port]; return 0; } int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) { return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, MLX4_CMD_WRAPPED); } EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic) { return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000, MLX4_CMD_NATIVE); } int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) { int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0, MLX4_CMD_SET_ICM_SIZE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret) return ret; /* * Round up number of system pages needed in case * MLX4_ICM_PAGE_SIZE < PAGE_SIZE. */ *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >> (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT); return 0; } int mlx4_NOP(struct mlx4_dev *dev) { /* Input modifier of 0x1f means "finish as soon as possible." 
*/ return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length, u8 op_modifier, u32 in_offset[], u32 counter_out[]) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; int ret; int i; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier, MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret) goto out; for (i = 0; i < array_length; i++) { if (in_offset[i] > MLX4_MAILBOX_SIZE) { ret = -EINVAL; goto out; } MLX4_GET(counter_out[i], outbox, in_offset[i]); } out: mlx4_free_cmd_mailbox(dev, mailbox); return ret; } EXPORT_SYMBOL_GPL(mlx4_query_diag_counters); int mlx4_MOD_STAT_CFG_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { return -EPERM; } #define MLX4_WOL_SETUP_MODE (5 << 28) int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port) { u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3, MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } EXPORT_SYMBOL_GPL(mlx4_wol_read); int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) { u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } EXPORT_SYMBOL_GPL(mlx4_wol_write); enum { ADD_TO_MCG = 0x26, }; void mlx4_opreq_action(struct work_struct *work) { struct mlx4_priv *priv = container_of(work, struct mlx4_priv, opreq_task); struct mlx4_dev *dev = &priv->dev; int num_tasks = atomic_read(&priv->opreq_count); struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; u32 *outbox; u32 modifier; u16 token; u16 type_m; u16 type; int err; u32 num_qps; struct mlx4_qp qp; int i; u8 rem_mcg; u8 prot; #define GET_OP_REQ_MODIFIER_OFFSET 0x08 #define GET_OP_REQ_TOKEN_OFFSET 0x14 #define GET_OP_REQ_TYPE_OFFSET 0x1a #define GET_OP_REQ_DATA_OFFSET 0x20 mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n"); return; } outbox = mailbox->buf; while (num_tasks) { err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) { mlx4_err(dev, "Failed to retrieve required operation: %d\n", err); return; } MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET); MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET); MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET); type_m = type >> 12; type &= 0xfff; switch (type) { case ADD_TO_MCG: if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_warn(dev, "ADD MCG operation is not supported in " "DEVICE_MANAGED steering mode\n"); err = EPERM; break; } mgm = (struct mlx4_mgm *) ((u8 *) (outbox) + GET_OP_REQ_DATA_OFFSET); num_qps = be32_to_cpu(mgm->members_count) & MGM_QPN_MASK; rem_mcg = ((u8 *) (&mgm->members_count))[0] & 1; prot = ((u8 *) (&mgm->members_count))[0] >> 6; for (i = 0; i < num_qps; i++) { qp.qpn = be32_to_cpu(mgm->qp[i]); if (rem_mcg) err = mlx4_multicast_detach(dev, &qp, mgm->gid, prot, 0); else err = mlx4_multicast_attach(dev, &qp, mgm->gid, mgm->gid[5], 0, prot, NULL); if (err) break; } break; default: mlx4_warn(dev, "Bad type for required operation\n"); err = EINVAL; break; } err = mlx4_cmd(dev, 0, ((u32) err | cpu_to_be32(token) << 16), 1, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A,
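/*
 * op_modifier 1 turns GET_OP_REQ into an acknowledgment; the
 * completion status and token ride back to FW in the input modifier.
 */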
MLX4_CMD_NATIVE); if (err) { mlx4_err(dev, "Failed to acknowledge required request: %d\n", err); goto out; } memset(outbox, 0, 0xffc); num_tasks = atomic_dec_return(&priv->opreq_count); } out: mlx4_free_cmd_mailbox(dev, mailbox); } Index: head/sys/ofed/drivers/net/mlx4/main.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/main.c (revision 300675) +++ head/sys/ofed/drivers/net/mlx4/main.c (revision 300676) @@ -1,3881 +1,3883 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved. * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#define LINUXKPI_PARAM_PREFIX mlx4_ + #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4.h" #include "fw.h" #include "icm.h" #include "mlx4_stats.h" /* Mellanox ConnectX HCA low-level driver */ struct workqueue_struct *mlx4_wq; #ifdef CONFIG_MLX4_DEBUG int mlx4_debug_level = 0; module_param_named(debug_level, mlx4_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif /* CONFIG_MLX4_DEBUG */ #ifdef CONFIG_PCI_MSI static int msi_x = 1; module_param(msi_x, int, 0444); MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number of MSI-X irqs to msi_x (non-SRIOV only)"); #else /* CONFIG_PCI_MSI */ #define msi_x (0) #endif /* CONFIG_PCI_MSI */ static int enable_sys_tune = 0; module_param(enable_sys_tune, int, 0444); MODULE_PARM_DESC(enable_sys_tune, "Tune the CPUs for better performance (default 0)"); int mlx4_blck_lb = 1; module_param_named(block_loopback, mlx4_blck_lb, int, 0644); MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0 " "(default: 1)"); enum { DEFAULT_DOMAIN = 0, BDF_STR_SIZE = 8, /* bb:dd.f- */ DBDF_STR_SIZE = 13 /* mmmm:bb:dd.f- */ }; enum { NUM_VFS, PROBE_VF, PORT_TYPE_ARRAY }; enum { VALID_DATA, INVALID_DATA, INVALID_STR }; struct param_data { int id; struct mlx4_dbdf2val_lst dbdf2val; }; static struct param_data num_vfs = { .id = NUM_VFS, .dbdf2val = { .name = "num_vfs param", .num_vals = 1, .def_val = {0}, .range = {0, MLX4_MAX_NUM_VF} } }; module_param_string(num_vfs, num_vfs.dbdf2val.str, sizeof(num_vfs.dbdf2val.str), 0444); MODULE_PARM_DESC(num_vfs, "Either a single value (e.g. '5') to define a uniform num_vfs value for all device functions\n" "\t\tor a string to map device function numbers to their num_vfs values (e.g. '0000:04:00.0-5,002b:1c:0b.a-15').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for the num_vfs value (e.g. 15)."); static struct param_data probe_vf = { .id = PROBE_VF, .dbdf2val = { .name = "probe_vf param", .num_vals = 1, .def_val = {0}, .range = {0, MLX4_MAX_NUM_VF} } }; module_param_string(probe_vf, probe_vf.dbdf2val.str, sizeof(probe_vf.dbdf2val.str), 0444); MODULE_PARM_DESC(probe_vf, "Either a single value (e.g. '3') to define a uniform number of VFs for the PF driver to probe on all device functions\n" "\t\tor a string to map device function numbers to their probe_vf values (e.g. '0000:04:00.0-3,002b:1c:0b.a-13').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for the probe_vf value (e.g. 13)."); int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; module_param_named(log_num_mgm_entry_size, mlx4_log_num_mgm_entry_size, int, 0444); MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, which defines the num" " of QPs per MCG, for example:" " 10 gives 248. Range: 7 <=" " log_num_mgm_entry_size <= 12."
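/*
 * Worked example (added; my arithmetic): with log_num_mgm_entry_size
 * = n, an MGM entry holds 4 * ((1 << n) / 16 - 2) QPs -- the same
 * formula used in slave_adjust_steering_mode() below -- so n = 10
 * gives 4 * (1024 / 16 - 2) = 248, which is where the "10 gives 248"
 * figure in this description comes from.
 */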
" To activate device managed" " flow steering when available, set to -1"); static int high_rate_steer; module_param(high_rate_steer, int, 0444); MODULE_PARM_DESC(high_rate_steer, "Enable steering mode for higher packet rate" " (default off)"); static int fast_drop; module_param_named(fast_drop, fast_drop, int, 0444); MODULE_PARM_DESC(fast_drop, "Enable fast packet drop when no receive WQEs are posted"); int mlx4_enable_64b_cqe_eqe = 1; module_param_named(enable_64b_cqe_eqe, mlx4_enable_64b_cqe_eqe, int, 0644); MODULE_PARM_DESC(enable_64b_cqe_eqe, "Enable 64 byte CQEs/EQEs when the FW supports this if non-zero (default: 1)"); #define HCA_GLOBAL_CAP_MASK 0 #define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE static char mlx4_version[] __devinitdata = DRV_NAME ": Mellanox ConnectX VPI driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; static int log_num_mac = 7; module_param_named(log_num_mac, log_num_mac, int, 0444); MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); static int log_num_vlan; module_param_named(log_num_vlan, log_num_vlan, int, 0444); MODULE_PARM_DESC(log_num_vlan, "(Obsolete) Log2 max number of VLANs per ETH port (0-7)"); /* Log2 max number of VLANs per ETH port (0-7) */ #define MLX4_LOG_NUM_VLANS 7 int log_mtts_per_seg = ilog2(1); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment " "(0-7) (default: 0)"); static struct param_data port_type_array = { .id = PORT_TYPE_ARRAY, .dbdf2val = { .name = "port_type_array param", .num_vals = 2, .def_val = {MLX4_PORT_TYPE_ETH, MLX4_PORT_TYPE_ETH}, .range = {MLX4_PORT_TYPE_IB, MLX4_PORT_TYPE_NA} } }; module_param_string(port_type_array, port_type_array.dbdf2val.str, sizeof(port_type_array.dbdf2val.str), 0444); MODULE_PARM_DESC(port_type_array, "Either pair of values (e.g. '1,2') to define uniform port1/port2 types configuration for all devices functions\n" "\t\tor a string to map device function numbers to their pair of port types values (e.g. '0000:04:00.0-1;2,002b:1c:0b.a-1;1').\n" "\t\tValid port types: 1-ib, 2-eth, 3-auto, 4-N/A\n" "\t\tIn case that only one port is available use the N/A port type for port2 (e.g '1,4')."); struct mlx4_port_config { struct list_head list; enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; struct pci_dev *pdev; }; #define MLX4_LOG_NUM_MTT 20 /* We limit to 30 as of a bit map issue which uses int and not uint. see mlx4_buddy_init -> bitmap_zero which gets int. 
*/ #define MLX4_MAX_LOG_NUM_MTT 30 static struct mlx4_profile mod_param_profile = { .num_qp = 19, .num_srq = 16, .rdmarc_per_qp = 4, .num_cq = 16, .num_mcg = 13, .num_mpt = 19, .num_mtt_segs = 0, /* max(20, 2*MTTs for host memory)) */ }; module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444); MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA (default: 19)"); module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444); MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA " "(default: 16)"); module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444); MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP " "(default: 4)"); module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444); MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA (default: 16)"); module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444); MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA " "(default: 13)"); module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444); MODULE_PARM_DESC(log_num_mpt, "log maximum number of memory protection table entries per " "HCA (default: 19)"); module_param_named(log_num_mtt, mod_param_profile.num_mtt_segs, int, 0444); MODULE_PARM_DESC(log_num_mtt, "log maximum number of memory translation table segments per " "HCA (default: max(20, 2*MTTs for register all of the host memory limited to 30))"); enum { MLX4_IF_STATE_BASIC, MLX4_IF_STATE_EXTENDED }; static inline u64 dbdf_to_u64(int domain, int bus, int dev, int fn) { return (domain << 20) | (bus << 12) | (dev << 4) | fn; } static inline void pr_bdf_err(const char *dbdf, const char *pname) { pr_warn("mlx4_core: '%s' is not valid bdf in '%s'\n", dbdf, pname); } static inline void pr_val_err(const char *dbdf, const char *pname, const char *val) { pr_warn("mlx4_core: value '%s' of bdf '%s' in '%s' is not valid\n" , val, dbdf, pname); } static inline void pr_out_of_range_bdf(const char *dbdf, int val, struct mlx4_dbdf2val_lst *dbdf2val) { pr_warn("mlx4_core: value %d in bdf '%s' of '%s' is out of its valid range (%d,%d)\n" , val, dbdf, dbdf2val->name , dbdf2val->range.min, dbdf2val->range.max); } static inline void pr_out_of_range(struct mlx4_dbdf2val_lst *dbdf2val) { pr_warn("mlx4_core: value of '%s' is out of its valid range (%d,%d)\n" , dbdf2val->name , dbdf2val->range.min, dbdf2val->range.max); } static inline int is_in_range(int val, struct mlx4_range *r) { return (val >= r->min && val <= r->max); } static int update_defaults(struct param_data *pdata) { long int val[MLX4_MAX_BDF_VALS]; int ret; char *t, *p = pdata->dbdf2val.str; char sval[32]; int val_len; if (!strlen(p) || strchr(p, ':') || strchr(p, '.') || strchr(p, ';')) return INVALID_STR; switch (pdata->id) { case PORT_TYPE_ARRAY: t = strchr(p, ','); if (!t || t == p || (t - p) > sizeof(sval)) return INVALID_STR; val_len = t - p; strncpy(sval, p, val_len); sval[val_len] = 0; ret = kstrtol(sval, 0, &val[0]); if (ret == -EINVAL) return INVALID_STR; if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) { pr_out_of_range(&pdata->dbdf2val); return INVALID_DATA; } ret = kstrtol(t + 1, 0, &val[1]); if (ret == -EINVAL) return INVALID_STR; if (ret || !is_in_range(val[1], &pdata->dbdf2val.range)) { pr_out_of_range(&pdata->dbdf2val); return INVALID_DATA; } pdata->dbdf2val.tbl[0].val[0] = val[0]; pdata->dbdf2val.tbl[0].val[1] = val[1]; break; case NUM_VFS: case PROBE_VF: ret = kstrtol(p, 0, &val[0]); if (ret == -EINVAL) return 
INVALID_STR; if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) { pr_out_of_range(&pdata->dbdf2val); return INVALID_DATA; } pdata->dbdf2val.tbl[0].val[0] = val[0]; break; } pdata->dbdf2val.tbl[1].dbdf = MLX4_ENDOF_TBL; return VALID_DATA; } int mlx4_fill_dbdf2val_tbl(struct mlx4_dbdf2val_lst *dbdf2val_lst) { int domain, bus, dev, fn; u64 dbdf; char *p, *t, *v; char tmp[32]; char sbdf[32]; char sep = ','; int j, k, str_size, i = 1; int prfx_size; p = dbdf2val_lst->str; for (j = 0; j < dbdf2val_lst->num_vals; j++) dbdf2val_lst->tbl[0].val[j] = dbdf2val_lst->def_val[j]; dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL; str_size = strlen(dbdf2val_lst->str); if (str_size == 0) return 0; while (strlen(p)) { prfx_size = BDF_STR_SIZE; sbdf[prfx_size] = 0; strncpy(sbdf, p, prfx_size); domain = DEFAULT_DOMAIN; if (sscanf(sbdf, "%02x:%02x.%x-", &bus, &dev, &fn) != 3) { prfx_size = DBDF_STR_SIZE; sbdf[prfx_size] = 0; strncpy(sbdf, p, prfx_size); if (sscanf(sbdf, "%04x:%02x:%02x.%x-", &domain, &bus, &dev, &fn) != 4) { pr_bdf_err(sbdf, dbdf2val_lst->name); goto err; } sprintf(tmp, "%04x:%02x:%02x.%x-", domain, bus, dev, fn); } else { sprintf(tmp, "%02x:%02x.%x-", bus, dev, fn); } if (strnicmp(sbdf, tmp, sizeof(tmp))) { pr_bdf_err(sbdf, dbdf2val_lst->name); goto err; } dbdf = dbdf_to_u64(domain, bus, dev, fn); for (j = 1; j < i; j++) if (dbdf2val_lst->tbl[j].dbdf == dbdf) { pr_warn("mlx4_core: in '%s', %s appears multiple times\n" , dbdf2val_lst->name, sbdf); goto err; } if (i >= MLX4_DEVS_TBL_SIZE) { pr_warn("mlx4_core: Too many devices in '%s'\n" , dbdf2val_lst->name); goto err; } p += prfx_size; t = strchr(p, sep); t = t ? t : p + strlen(p); if (p >= t) { pr_val_err(sbdf, dbdf2val_lst->name, ""); goto err; } for (k = 0; k < dbdf2val_lst->num_vals; k++) { char sval[32]; long int val; int ret, val_len; char vsep = ';'; v = (k == dbdf2val_lst->num_vals - 1) ? t : strchr(p, vsep); if (!v || v > t || v == p || (v - p) > sizeof(sval)) { pr_val_err(sbdf, dbdf2val_lst->name, p); goto err; } val_len = v - p; strncpy(sval, p, val_len); sval[val_len] = 0; ret = kstrtol(sval, 0, &val); if (ret) { if (strchr(p, vsep)) pr_warn("mlx4_core: too many vals in bdf '%s' of '%s'\n" , sbdf, dbdf2val_lst->name); else pr_val_err(sbdf, dbdf2val_lst->name, sval); goto err; } if (!is_in_range(val, &dbdf2val_lst->range)) { pr_out_of_range_bdf(sbdf, val, dbdf2val_lst); goto err; } dbdf2val_lst->tbl[i].val[k] = val; p = v; if (p[0] == vsep) p++; } dbdf2val_lst->tbl[i].dbdf = dbdf; if (strlen(p)) { if (p[0] != sep) { pr_warn("mlx4_core: expect separator '%c' before '%s' in '%s'\n" , sep, p, dbdf2val_lst->name); goto err; } p++; } i++; if (i < MLX4_DEVS_TBL_SIZE) dbdf2val_lst->tbl[i].dbdf = MLX4_ENDOF_TBL; } return 0; err: dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL; pr_warn("mlx4_core: The value of '%s' is incorrect. 
The value is discarded!\n" , dbdf2val_lst->name); return -EINVAL; } EXPORT_SYMBOL(mlx4_fill_dbdf2val_tbl); int mlx4_get_val(struct mlx4_dbdf2val *tbl, struct pci_dev *pdev, int idx, int *val) { u64 dbdf; int i = 1; *val = tbl[0].val[idx]; if (!pdev) return -EINVAL; dbdf = dbdf_to_u64(pci_get_domain(pdev->dev.bsddev), pci_get_bus(pdev->dev.bsddev), PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); while ((i < MLX4_DEVS_TBL_SIZE) && (tbl[i].dbdf != MLX4_ENDOF_TBL)) { if (tbl[i].dbdf == dbdf) { *val = tbl[i].val[idx]; return 0; } i++; } return 0; } EXPORT_SYMBOL(mlx4_get_val); static void process_mod_param_profile(struct mlx4_profile *profile) { vm_size_t hwphyssz; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.realmem", (u_long *) &hwphyssz); profile->num_qp = 1 << mod_param_profile.num_qp; profile->num_srq = 1 << mod_param_profile.num_srq; profile->rdmarc_per_qp = 1 << mod_param_profile.rdmarc_per_qp; profile->num_cq = 1 << mod_param_profile.num_cq; profile->num_mcg = 1 << mod_param_profile.num_mcg; profile->num_mpt = 1 << mod_param_profile.num_mpt; /* * We want to scale the number of MTTs with the size of the * system memory, since it makes sense to register a lot of * memory on a system with a lot of memory. As a heuristic, * make sure we have enough MTTs to register twice the system * memory (with PAGE_SIZE entries). * * This number has to be a power of two and fit into 32 bits * due to device limitations. We cap this at 2^30 as of bit map * limitation to work with int instead of uint (mlx4_buddy_init -> bitmap_zero) * That limits us to 4TB of memory registration per HCA with * 4KB pages, which is probably OK for the next few months. */ if (mod_param_profile.num_mtt_segs) profile->num_mtt_segs = 1 << mod_param_profile.num_mtt_segs; else { profile->num_mtt_segs = roundup_pow_of_two(max_t(unsigned, 1 << (MLX4_LOG_NUM_MTT - log_mtts_per_seg), min(1UL << (MLX4_MAX_LOG_NUM_MTT - log_mtts_per_seg), (hwphyssz << 1) >> log_mtts_per_seg))); /* set the actual value, so it will be reflected to the user using the sysfs */ mod_param_profile.num_mtt_segs = ilog2(profile->num_mtt_segs); } } int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { int i; for (i = 0; i < dev->caps.num_ports - 1; i++) { if (port_type[i] != port_type[i + 1]) { if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { mlx4_err(dev, "Only same port types supported " "on this HCA, aborting.\n"); return -EINVAL; } } } for (i = 0; i < dev->caps.num_ports; i++) { if (!(port_type[i] & dev->caps.supported_type[i+1])) { mlx4_err(dev, "Requested port type for port %d is not " "supported on this HCA\n", i + 1); return -EINVAL; } } return 0; } static void mlx4_set_port_mask(struct mlx4_dev *dev) { int i; for (i = 1; i <= dev->caps.num_ports; ++i) dev->caps.port_mask[i] = dev->caps.port_type[i]; } static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; int i; err = mlx4_QUERY_DEV_CAP(dev, dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); return err; } if (dev_cap->min_page_sz > PAGE_SIZE) { mlx4_err(dev, "HCA minimum page size of %d bigger than " "kernel PAGE_SIZE of %d, aborting.\n", dev_cap->min_page_sz, (int)PAGE_SIZE); return -ENODEV; } if (dev_cap->num_ports > MLX4_MAX_PORTS) { mlx4_err(dev, "HCA has %d ports, but we only support %d, " "aborting.\n", dev_cap->num_ports, MLX4_MAX_PORTS); return -ENODEV; } if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) { mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than " "PCI resource 2 size of 0x%llx, 
aborting.\n", dev_cap->uar_size, (unsigned long long) pci_resource_len(dev->pdev, 2)); return -ENODEV; } dev->caps.num_ports = dev_cap->num_ports; dev->phys_caps.num_phys_eqs = MLX4_MAX_EQ_NUM; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.vl_cap[i] = dev_cap->max_vl[i]; dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i]; dev->phys_caps.gid_phys_table_len[i] = dev_cap->max_gids[i]; dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i]; /* set gid and pkey table operating lengths by default * to non-sriov values */ dev->caps.gid_table_len[i] = dev_cap->max_gids[i]; dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i]; dev->caps.port_width_cap[i] = dev_cap->max_port_width[i]; dev->caps.eth_mtu_cap[i] = dev_cap->eth_mtu[i]; dev->caps.def_mac[i] = dev_cap->def_mac[i]; dev->caps.supported_type[i] = dev_cap->supported_port_types[i]; dev->caps.suggested_type[i] = dev_cap->suggested_type[i]; dev->caps.default_sense[i] = dev_cap->default_sense[i]; dev->caps.trans_type[i] = dev_cap->trans_type[i]; dev->caps.vendor_oui[i] = dev_cap->vendor_oui[i]; dev->caps.wavelength[i] = dev_cap->wavelength[i]; dev->caps.trans_code[i] = dev_cap->trans_code[i]; } dev->caps.uar_page_size = PAGE_SIZE; dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; dev->caps.bf_reg_size = dev_cap->bf_reg_size; dev->caps.bf_regs_per_page = dev_cap->bf_regs_per_page; dev->caps.max_sq_sg = dev_cap->max_sq_sg; dev->caps.max_rq_sg = dev_cap->max_rq_sg; dev->caps.max_wqes = dev_cap->max_qp_sz; dev->caps.max_qp_init_rdma = dev_cap->max_requester_per_qp; dev->caps.max_srq_wqes = dev_cap->max_srq_sz; dev->caps.max_srq_sge = dev_cap->max_rq_sg - 1; dev->caps.reserved_srqs = dev_cap->reserved_srqs; dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz; dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz; /* * Subtract 1 from the limit because we need to allocate a * spare CQE to enable resizing the CQ */ dev->caps.max_cqes = dev_cap->max_cq_sz - 1; dev->caps.reserved_cqs = dev_cap->reserved_cqs; dev->caps.reserved_eqs = dev_cap->reserved_eqs; dev->caps.reserved_mtts = dev_cap->reserved_mtts; dev->caps.reserved_mrws = dev_cap->reserved_mrws; /* The first 128 UARs are used for EQ doorbells */ dev->caps.reserved_uars = max_t(int, 128, dev_cap->reserved_uars); dev->caps.reserved_pds = dev_cap->reserved_pds; dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? dev_cap->reserved_xrcds : 0; dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? dev_cap->max_xrcds : 0; dev->caps.mtt_entry_sz = dev_cap->mtt_entry_sz; dev->caps.max_msg_sz = dev_cap->max_msg_sz; dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); dev->caps.flags = dev_cap->flags; dev->caps.flags2 = dev_cap->flags2; dev->caps.bmme_flags = dev_cap->bmme_flags; dev->caps.reserved_lkey = dev_cap->reserved_lkey; dev->caps.stat_rate_support = dev_cap->stat_rate_support; dev->caps.cq_timestamp = dev_cap->timestamp_support; dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; /* Sense port always allowed on supported devices for ConnectX-1 and -2 */ if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT) dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; /* Don't do sense port on multifunction devices (for now at least) */ if (mlx4_is_mfunc(dev)) dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; dev->caps.log_num_macs = log_num_mac; dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; dev->caps.fast_drop = fast_drop ? 
!!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FAST_DROP) : 0; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; if (dev->caps.supported_type[i]) { /* if only ETH is supported - assign ETH */ if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH) dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; /* if only IB is supported, assign IB */ else if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_IB) dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; else { /* * if IB and ETH are supported, we set the port * type according to user selection of port type; * if there is no user selection, take the FW hint */ int pta; mlx4_get_val(port_type_array.dbdf2val.tbl, pci_physfn(dev->pdev), i - 1, &pta); if (pta == MLX4_PORT_TYPE_NONE) { dev->caps.port_type[i] = dev->caps.suggested_type[i] ? MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB; } else if (pta == MLX4_PORT_TYPE_NA) { mlx4_err(dev, "Port %d is a valid port. " "It is not allowed to configure its type to N/A(%d)\n", i, MLX4_PORT_TYPE_NA); return -EINVAL; } else { dev->caps.port_type[i] = pta; } } } /* * Link sensing is allowed on the port if 3 conditions are true: * 1. Both protocols are supported on the port. * 2. Different types are supported on the port. * 3. FW declared that it supports link sensing. */ mlx4_priv(dev)->sense.sense_allowed[i] = ((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) && (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)); /* Disabling auto sense for default Eth ports support */ mlx4_priv(dev)->sense.sense_allowed[i] = 0; /* * If the "default_sense" bit is set, we move the port to "AUTO" mode * and perform the sense_port FW command to try and set the correct * port type from the beginning */ if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) { enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE; dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO; mlx4_SENSE_PORT(dev, i, &sensed_port); if (sensed_port != MLX4_PORT_TYPE_NONE) dev->caps.port_type[i] = sensed_port; } else { dev->caps.possible_type[i] = dev->caps.port_type[i]; } if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) { dev->caps.log_num_macs = dev_cap->log_max_macs[i]; mlx4_warn(dev, "Requested number of MACs is too large " "for port %d, reducing to %d.\n", i, 1 << dev->caps.log_num_macs); } if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) { dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; mlx4_warn(dev, "Requested number of VLANs is too large " "for port %d, reducing to %d.\n", i, 1 << dev->caps.log_num_vlans); } } dev->caps.max_basic_counters = dev_cap->max_basic_counters; dev->caps.max_extended_counters = dev_cap->max_extended_counters; /* support extended counters if available */ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT) dev->caps.max_counters = dev->caps.max_extended_counters; else dev->caps.max_counters = dev->caps.max_basic_counters; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] = (1 << dev->caps.log_num_macs) * (1 << dev->caps.log_num_vlans) * dev->caps.num_ports; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; dev->caps.sync_qp = dev_cap->sync_qp; if (dev->pdev->device == 0x1003)
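/*
 * Note added: 0x1003 is, to my knowledge, the PCI device ID of the
 * ConnectX-3 HCA, the only device given the MLX4_DEV_CAP_CQ_FLAG_IO
 * quirk below.
 */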
dev->caps.cq_flags |= MLX4_DEV_CAP_CQ_FLAG_IO; dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0; if (!mlx4_enable_64b_cqe_eqe && !mlx4_is_slave(dev)) { if (dev_cap->flags & (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) { mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n"); dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; } } if ((dev->caps.flags & (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) && mlx4_is_master(dev)) dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; if (!mlx4_is_slave(dev)) { for (i = 0; i < dev->caps.num_ports; ++i) dev->caps.def_counter_index[i] = i << 1; } return 0; } /*The function checks if there are live vf, return the num of them*/ static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_slave_state *s_state; int i; int ret = 0; for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) { s_state = &priv->mfunc.master.slave_state[i]; if (s_state->active && s_state->last_cmd != MLX4_COMM_CMD_RESET) { mlx4_warn(dev, "%s: slave: %d is still active\n", __func__, i); ret++; } } return ret; } int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey) { u32 qk = MLX4_RESERVED_QKEY_BASE; if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX || qpn < dev->phys_caps.base_proxy_sqpn) return -EINVAL; if (qpn >= dev->phys_caps.base_tunnel_sqpn) /* tunnel qp */ qk += qpn - dev->phys_caps.base_tunnel_sqpn; else qk += qpn - dev->phys_caps.base_proxy_sqpn; *qkey = qk; return 0; } EXPORT_SYMBOL(mlx4_get_parav_qkey); void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val) { struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); if (!mlx4_is_master(dev)) return; priv->virt2phys_pkey[slave][port - 1][i] = val; } EXPORT_SYMBOL(mlx4_sync_pkey_table); void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid) { struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); if (!mlx4_is_master(dev)) return; priv->slave_node_guids[slave] = guid; } EXPORT_SYMBOL(mlx4_put_slave_node_guid); __be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); if (!mlx4_is_master(dev)) return 0; return priv->slave_node_guids[slave]; } EXPORT_SYMBOL(mlx4_get_slave_node_guid); int mlx4_is_slave_active(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_slave_state *s_slave; if (!mlx4_is_master(dev)) return 0; s_slave = &priv->mfunc.master.slave_state[slave]; return !!s_slave->active; } EXPORT_SYMBOL(mlx4_is_slave_active); static void slave_adjust_steering_mode(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *hca_param) { dev->caps.steering_mode = hca_param->steering_mode; if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; else dev->caps.num_qp_per_mgm = 4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2); mlx4_dbg(dev, "Steering mode is: %s\n", mlx4_steering_mode_str(dev->caps.steering_mode)); } static int mlx4_slave_cap(struct mlx4_dev *dev) { int err; u32 page_size; struct mlx4_dev_cap dev_cap; struct mlx4_func_cap func_cap; struct mlx4_init_hca_param hca_param; int i; memset(&hca_param, 0, sizeof(hca_param)); err = mlx4_QUERY_HCA(dev, &hca_param); if (err) { mlx4_err(dev, "QUERY_HCA command failed, aborting.\n"); return err; } /*fail if the hca has an unknown 
capability */ if ((hca_param.global_caps | HCA_GLOBAL_CAP_MASK) != HCA_GLOBAL_CAP_MASK) { mlx4_err(dev, "Unknown hca global capabilities\n"); return -ENOSYS; } mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz; dev->caps.hca_core_clock = hca_param.hca_core_clock; memset(&dev_cap, 0, sizeof(dev_cap)); dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp; err = mlx4_dev_cap(dev, &dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); return err; } err = mlx4_QUERY_FW(dev); if (err) mlx4_err(dev, "QUERY_FW command failed: could not get FW version.\n"); if (!hca_param.mw_enable) { dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW; dev->caps.bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; } page_size = ~dev->caps.page_size_cap + 1; mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); if (page_size > PAGE_SIZE) { mlx4_err(dev, "HCA minimum page size of %d bigger than " "kernel PAGE_SIZE of %d, aborting.\n", page_size, (int)PAGE_SIZE); return -ENODEV; } /* slave gets uar page size from QUERY_HCA fw command */ dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12); /* TODO: relax this assumption */ if (dev->caps.uar_page_size != PAGE_SIZE) { mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %d\n", dev->caps.uar_page_size, (int)PAGE_SIZE); return -ENODEV; } memset(&func_cap, 0, sizeof(func_cap)); err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap); if (err) { mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d).\n", err); return err; } if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != PF_CONTEXT_BEHAVIOUR_MASK) { mlx4_err(dev, "Unknown pf context behaviour\n"); return -ENOSYS; } dev->caps.num_ports = func_cap.num_ports; dev->quotas.qp = func_cap.qp_quota; dev->quotas.srq = func_cap.srq_quota; dev->quotas.cq = func_cap.cq_quota; dev->quotas.mpt = func_cap.mpt_quota; dev->quotas.mtt = func_cap.mtt_quota; dev->caps.num_qps = 1 << hca_param.log_num_qps; dev->caps.num_srqs = 1 << hca_param.log_num_srqs; dev->caps.num_cqs = 1 << hca_param.log_num_cqs; dev->caps.num_mpts = 1 << hca_param.log_mpt_sz; dev->caps.num_eqs = func_cap.max_eq; dev->caps.reserved_eqs = func_cap.reserved_eq; dev->caps.num_pds = MLX4_NUM_PDS; dev->caps.num_mgms = 0; dev->caps.num_amgms = 0; if (dev->caps.num_ports > MLX4_MAX_PORTS) { mlx4_err(dev, "HCA has %d ports, but we only support %d, " "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS); return -ENODEV; } dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { err = -ENOMEM; goto err_mem; } for (i = 1; i <= dev->caps.num_ports; ++i) { err = mlx4_QUERY_FUNC_CAP(dev, (u32) i, &func_cap); if (err) { mlx4_err(dev, "QUERY_FUNC_CAP port command failed for" " port %d, aborting (%d).\n", i, err); goto err_mem; } dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn; dev->caps.def_counter_index[i - 1] = func_cap.def_counter_index; dev->caps.port_mask[i] = dev->caps.port_type[i]; err = mlx4_get_slave_pkey_gid_tbl_len(dev, i, &dev->caps.gid_table_len[i], &dev->caps.pkey_table_len[i]); if (err) goto err_mem; } if 
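/*
 * Comment added for clarity: the following sanity check verifies that
 * the usable UAR pages, uar_page_size * (num_uars - reserved_uars),
 * fit inside PCI BAR 2, where the UARs are mapped.
 */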
(dev->caps.uar_page_size * (dev->caps.num_uars - dev->caps.reserved_uars) > pci_resource_len(dev->pdev, 2)) { mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than " "PCI resource 2 size of 0x%llx, aborting.\n", dev->caps.uar_page_size * dev->caps.num_uars, (unsigned long long) pci_resource_len(dev->pdev, 2)); err = -ENOMEM; goto err_mem; } if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { dev->caps.eqe_size = 64; dev->caps.eqe_factor = 1; } else { dev->caps.eqe_size = 32; dev->caps.eqe_factor = 0; } if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { dev->caps.cqe_size = 64; dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; } else { dev->caps.cqe_size = 32; } dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_warn(dev, "Timestamping is not supported in slave mode.\n"); slave_adjust_steering_mode(dev, &dev_cap, &hca_param); return 0; err_mem: kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); dev->caps.qp0_tunnel = dev->caps.qp0_proxy = dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; return err; } static void mlx4_request_modules(struct mlx4_dev *dev) { int port; int has_ib_port = false; int has_eth_port = false; #define EN_DRV_NAME "mlx4_en" #define IB_DRV_NAME "mlx4_ib" for (port = 1; port <= dev->caps.num_ports; port++) { if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) has_ib_port = true; else if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) has_eth_port = true; } if (has_ib_port) request_module_nowait(IB_DRV_NAME); if (has_eth_port) request_module_nowait(EN_DRV_NAME); } /* * Change the port configuration of the device. * Every user of this function must hold the port mutex. */ int mlx4_change_port_types(struct mlx4_dev *dev, enum mlx4_port_type *port_types) { int err = 0; int change = 0; int port; for (port = 0; port < dev->caps.num_ports; port++) { /* Change the port type only if the new type is different * from the current, and not set to Auto */ if (port_types[port] != dev->caps.port_type[port + 1]) change = 1; } if (change) { mlx4_unregister_device(dev); for (port = 1; port <= dev->caps.num_ports; port++) { mlx4_CLOSE_PORT(dev, port); dev->caps.port_type[port] = port_types[port - 1]; err = mlx4_SET_PORT(dev, port, -1); if (err) { mlx4_err(dev, "Failed to set port %d, " "aborting\n", port); goto out; } } mlx4_set_port_mask(dev); err = mlx4_register_device(dev); if (err) { mlx4_err(dev, "Failed to register device\n"); goto out; } mlx4_request_modules(dev); } out: return err; } static ssize_t show_port_type(struct device *dev, struct device_attribute *attr, char *buf) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_attr); struct mlx4_dev *mdev = info->dev; char type[8]; sprintf(type, "%s", (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ? 
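/*
 * Note added: a read of this sysfs attribute therefore yields "ib" or
 * "eth", or "auto (ib)" / "auto (eth)" when the possible type is
 * MLX4_PORT_TYPE_AUTO.
 */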
"ib" : "eth"); if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO) sprintf(buf, "auto (%s)\n", type); else sprintf(buf, "%s\n", type); return strlen(buf); } static ssize_t set_port_type(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_attr); struct mlx4_dev *mdev = info->dev; struct mlx4_priv *priv = mlx4_priv(mdev); enum mlx4_port_type types[MLX4_MAX_PORTS]; enum mlx4_port_type new_types[MLX4_MAX_PORTS]; int i; int err = 0; if (!strcmp(buf, "ib\n")) info->tmp_type = MLX4_PORT_TYPE_IB; else if (!strcmp(buf, "eth\n")) info->tmp_type = MLX4_PORT_TYPE_ETH; else if (!strcmp(buf, "auto\n")) info->tmp_type = MLX4_PORT_TYPE_AUTO; else { mlx4_err(mdev, "%s is not supported port type\n", buf); return -EINVAL; } if ((info->tmp_type & mdev->caps.supported_type[info->port]) != info->tmp_type) { mlx4_err(mdev, "Requested port type for port %d is not supported on this HCA\n", info->port); return -EINVAL; } mlx4_stop_sense(mdev); mutex_lock(&priv->port_mutex); /* Possible type is always the one that was delivered */ mdev->caps.possible_type[info->port] = info->tmp_type; for (i = 0; i < mdev->caps.num_ports; i++) { types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type : mdev->caps.possible_type[i+1]; if (types[i] == MLX4_PORT_TYPE_AUTO) types[i] = mdev->caps.port_type[i+1]; } if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) { for (i = 1; i <= mdev->caps.num_ports; i++) { if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { mdev->caps.possible_type[i] = mdev->caps.port_type[i]; err = -EINVAL; } } } if (err) { mlx4_err(mdev, "Auto sensing is not supported on this HCA. " "Set only 'eth' or 'ib' for both ports " "(should be the same)\n"); goto out; } mlx4_do_sense_ports(mdev, new_types, types); err = mlx4_check_port_params(mdev, new_types); if (err) goto out; /* We are about to apply the changes after the configuration * was verified, no need to remember the temporary types * any more */ for (i = 0; i < mdev->caps.num_ports; i++) priv->port[i + 1].tmp_type = 0; err = mlx4_change_port_types(mdev, new_types); out: mlx4_start_sense(mdev); mutex_unlock(&priv->port_mutex); return err ? 
err : count; } enum ibta_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; static inline int int_to_ibta_mtu(int mtu) { switch (mtu) { case 256: return IB_MTU_256; case 512: return IB_MTU_512; case 1024: return IB_MTU_1024; case 2048: return IB_MTU_2048; case 4096: return IB_MTU_4096; default: return -1; } } static inline int ibta_mtu_to_int(enum ibta_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info, board_attr); struct mlx4_dev *mdev = info->dev; return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, mdev->board_id); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info, hca_attr); struct mlx4_dev *mdev = info->dev; return sprintf(buf, "MT%d\n", mdev->pdev->device); } static ssize_t show_firmware_version(struct device *dev, struct device_attribute *attr, char *buf) { struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info, firmware_attr); struct mlx4_dev *mdev = info->dev; return sprintf(buf, "%d.%d.%d\n", (int)(mdev->caps.fw_ver >> 32), (int)(mdev->caps.fw_ver >> 16) & 0xffff, (int)mdev->caps.fw_ver & 0xffff); } static ssize_t show_port_ib_mtu(struct device *dev, struct device_attribute *attr, char *buf) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_mtu_attr); struct mlx4_dev *mdev = info->dev; /* When port type is eth, port mtu value isn't used. */ if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) return -EINVAL; sprintf(buf, "%d\n", ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port])); return strlen(buf); } static ssize_t set_port_ib_mtu(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_mtu_attr); struct mlx4_dev *mdev = info->dev; struct mlx4_priv *priv = mlx4_priv(mdev); int err, port, mtu, ibta_mtu = -1; if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) { mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); return -EINVAL; } mtu = (int) simple_strtol(buf, NULL, 0); ibta_mtu = int_to_ibta_mtu(mtu); if (ibta_mtu < 0) { mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf); return -EINVAL; } mdev->caps.port_ib_mtu[info->port] = ibta_mtu; mlx4_stop_sense(mdev); mutex_lock(&priv->port_mutex); mlx4_unregister_device(mdev); for (port = 1; port <= mdev->caps.num_ports; port++) { mlx4_CLOSE_PORT(mdev, port); err = mlx4_SET_PORT(mdev, port, -1); if (err) { mlx4_err(mdev, "Failed to set port %d, " "aborting\n", port); goto err_set_port; } } err = mlx4_register_device(mdev); err_set_port: mutex_unlock(&priv->port_mutex); mlx4_start_sense(mdev); return err ? 
err : count; } static int mlx4_load_fw(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err, unmap_flag = 0; priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.fw_icm) { mlx4_err(dev, "Couldn't allocate FW area, aborting.\n"); return -ENOMEM; } err = mlx4_MAP_FA(dev, priv->fw.fw_icm); if (err) { mlx4_err(dev, "MAP_FA command failed, aborting.\n"); goto err_free; } err = mlx4_RUN_FW(dev); if (err) { mlx4_err(dev, "RUN_FW command failed, aborting.\n"); goto err_unmap_fa; } return 0; err_unmap_fa: unmap_flag = mlx4_UNMAP_FA(dev); if (unmap_flag) pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); err_free: if (!unmap_flag) mlx4_free_icm(dev, priv->fw.fw_icm, 0); return err; } static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, int cmpt_entry_sz) { struct mlx4_priv *priv = mlx4_priv(dev); int err; int num_eqs; err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_QP * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) goto err; err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_SRQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_srqs, dev->caps.reserved_srqs, 0, 0); if (err) goto err_qp; err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_CQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_cqs, dev->caps.reserved_cqs, 0, 0); if (err) goto err_srq; num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_EQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, num_eqs, num_eqs, 0, 0); if (err) goto err_cq; return 0; err_cq: mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); err_srq: mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); err_qp: mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); err: return err; } static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *init_hca, u64 icm_size) { struct mlx4_priv *priv = mlx4_priv(dev); u64 aux_pages; int num_eqs; int err, unmap_flag = 0; err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); if (err) { mlx4_err(dev, "SET_ICM_SIZE command failed, aborting.\n"); return err; } mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory.\n", (unsigned long long) icm_size >> 10, (unsigned long long) aux_pages << 2); priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.aux_icm) { mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n"); return -ENOMEM; } err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm); if (err) { mlx4_err(dev, "MAP_ICM_AUX command failed, aborting.\n"); goto err_free_aux; } err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz); if (err) { mlx4_err(dev, "Failed to map cMPT context memory, aborting.\n"); goto err_unmap_aux; } num_eqs = (mlx4_is_master(dev)) ? 
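/*
 * Comment added for clarity: the master sizes the EQ context table
 * for all physical EQs, while a slave only sees the EQs reported to
 * it by QUERY_FUNC_CAP (dev->caps.num_eqs), hence the ternary below.
 */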
dev->phys_caps.num_phys_eqs : dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.table, init_hca->eqc_base, dev_cap->eqc_entry_sz, num_eqs, num_eqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_cmpt; } /* * Reserved MTT entries must be aligned up to a cacheline * boundary, since the FW will write to them, while the driver * writes to all other MTT entries. (The variable * dev->caps.mtt_entry_sz below is really the MTT segment * size, not the raw entry size) */ dev->caps.reserved_mtts = ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz, dma_get_cache_alignment()) / dev->caps.mtt_entry_sz; err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table, init_hca->mtt_base, dev->caps.mtt_entry_sz, dev->caps.num_mtts, dev->caps.reserved_mtts, 1, 0); if (err) { mlx4_err(dev, "Failed to map MTT context memory, aborting.\n"); goto err_unmap_eq; } err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table, init_hca->dmpt_base, dev_cap->dmpt_entry_sz, dev->caps.num_mpts, dev->caps.reserved_mrws, 1, 1); if (err) { mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n"); goto err_unmap_mtt; } err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table, init_hca->qpc_base, dev_cap->qpc_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map QP context memory, aborting.\n"); goto err_unmap_dmpt; } err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table, init_hca->auxc_base, dev_cap->aux_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n"); goto err_unmap_qp; } err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table, init_hca->altc_base, dev_cap->altc_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n"); goto err_unmap_auxc; } err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table, init_hca->rdmarc_base, dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n"); goto err_unmap_altc; } err = mlx4_init_icm_table(dev, &priv->cq_table.table, init_hca->cqc_base, dev_cap->cqc_entry_sz, dev->caps.num_cqs, dev->caps.reserved_cqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map CQ context memory, aborting.\n"); goto err_unmap_rdmarc; } err = mlx4_init_icm_table(dev, &priv->srq_table.table, init_hca->srqc_base, dev_cap->srq_entry_sz, dev->caps.num_srqs, dev->caps.reserved_srqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n"); goto err_unmap_cq; } /* * For flow steering device managed mode it is required to use * mlx4_init_icm_table. For B0 steering mode it's not strictly * required, but for simplicity just map the whole multicast * group table now. The table isn't very big and it's a lot * easier than trying to track ref counts. 
*/ err = mlx4_init_icm_table(dev, &priv->mcg_table.table, init_hca->mc_base, mlx4_get_mgm_entry_size(dev), dev->caps.num_mgms + dev->caps.num_amgms, dev->caps.num_mgms + dev->caps.num_amgms, 0, 0); if (err) { mlx4_err(dev, "Failed to map MCG context memory, aborting.\n"); goto err_unmap_srq; } return 0; err_unmap_srq: mlx4_cleanup_icm_table(dev, &priv->srq_table.table); err_unmap_cq: mlx4_cleanup_icm_table(dev, &priv->cq_table.table); err_unmap_rdmarc: mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); err_unmap_altc: mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); err_unmap_auxc: mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); err_unmap_qp: mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); err_unmap_dmpt: mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); err_unmap_mtt: mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); err_unmap_eq: mlx4_cleanup_icm_table(dev, &priv->eq_table.table); err_unmap_cmpt: mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); err_unmap_aux: unmap_flag = mlx4_UNMAP_ICM_AUX(dev); if (unmap_flag) pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n"); err_free_aux: if (!unmap_flag) mlx4_free_icm(dev, priv->fw.aux_icm, 0); return err; } static void mlx4_free_icms(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); mlx4_cleanup_icm_table(dev, &priv->mcg_table.table); mlx4_cleanup_icm_table(dev, &priv->srq_table.table); mlx4_cleanup_icm_table(dev, &priv->cq_table.table); mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); mlx4_cleanup_icm_table(dev, &priv->eq_table.table); mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); if (!mlx4_UNMAP_ICM_AUX(dev)) mlx4_free_icm(dev, priv->fw.aux_icm, 0); else pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n"); } static void mlx4_slave_exit(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); mutex_lock(&priv->cmd.slave_cmd_mutex); if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) mlx4_warn(dev, "Failed to close slave function.\n"); mutex_unlock(&priv->cmd.slave_cmd_mutex); } static int map_bf_area(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); resource_size_t bf_start; resource_size_t bf_len; int err = 0; if (!dev->caps.bf_reg_size) return -ENXIO; bf_start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); bf_len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); if (!priv->bf_mapping) err = -ENOMEM; return err; } static void unmap_bf_area(struct mlx4_dev *dev) { if (mlx4_priv(dev)->bf_mapping) io_mapping_free(mlx4_priv(dev)->bf_mapping); } int mlx4_read_clock(struct mlx4_dev *dev) { u32 clockhi, clocklo, clockhi1; cycle_t cycles; int i; struct mlx4_priv *priv = mlx4_priv(dev); if (!priv->clock_mapping) return -ENOTSUPP; for (i = 0; i < 10; i++) { clockhi = swab32(readl(priv->clock_mapping)); clocklo = 
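/*
 * Note added: this is the classic high/low/high read of a 64-bit
 * free-running counter through a 32-bit window: if the high word
 * changed while the low word was being read, the low word may have
 * wrapped mid-read, so the loop retries (up to 10 times).
 */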
swab32(readl(priv->clock_mapping + 4)); clockhi1 = swab32(readl(priv->clock_mapping)); if (clockhi == clockhi1) break; } cycles = (u64) clockhi << 32 | (u64) clocklo; return cycles; } EXPORT_SYMBOL_GPL(mlx4_read_clock); static int map_internal_clock(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); priv->clock_mapping = ioremap(pci_resource_start(dev->pdev, priv->fw.clock_bar) + priv->fw.clock_offset, MLX4_CLOCK_SIZE); if (!priv->clock_mapping) return -ENOMEM; return 0; } int mlx4_get_internal_clock_params(struct mlx4_dev *dev, struct mlx4_clock_params *params) { struct mlx4_priv *priv = mlx4_priv(dev); if (mlx4_is_slave(dev)) return -ENOTSUPP; if (!params) return -EINVAL; params->bar = priv->fw.clock_bar; params->offset = priv->fw.clock_offset; params->size = MLX4_CLOCK_SIZE; return 0; } EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params); static void unmap_internal_clock(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); if (priv->clock_mapping) iounmap(priv->clock_mapping); } static void mlx4_close_hca(struct mlx4_dev *dev) { unmap_internal_clock(dev); unmap_bf_area(dev); if (mlx4_is_slave(dev)) { mlx4_slave_exit(dev); } else { mlx4_CLOSE_HCA(dev, 0); mlx4_free_icms(dev); if (!mlx4_UNMAP_FA(dev)) mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); else pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); } } static int mlx4_init_slave(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); u64 dma = (u64) priv->mfunc.vhcr_dma; int num_of_reset_retries = NUM_OF_RESET_RETRIES; int ret_from_reset = 0; u32 slave_read; u32 cmd_channel_ver; mutex_lock(&priv->cmd.slave_cmd_mutex); priv->cmd.max_cmds = 1; mlx4_warn(dev, "Sending reset\n"); ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); /* if we are in the middle of FLR, the slave will retry * NUM_OF_RESET_RETRIES times before giving up. */ if (ret_from_reset) { if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) { msleep(SLEEP_TIME_IN_RESET); while (ret_from_reset && num_of_reset_retries) { mlx4_warn(dev, "slave is currently in the " "middle of FLR. Retrying..."
"(try num:%d)\n", (NUM_OF_RESET_RETRIES - num_of_reset_retries + 1)); ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); num_of_reset_retries = num_of_reset_retries - 1; } } else goto err; } /* check the driver version - the slave I/F revision * must match the master's */ slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); cmd_channel_ver = mlx4_comm_get_version(); if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != MLX4_COMM_GET_IF_REV(slave_read)) { mlx4_err(dev, "slave driver version is not supported" " by the master\n"); goto err; } mlx4_warn(dev, "Sending vhcr0\n"); if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) goto err; mutex_unlock(&priv->cmd.slave_cmd_mutex); return 0; err: mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); mutex_unlock(&priv->cmd.slave_cmd_mutex); return -EIO; } static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev) { int i; for (i = 1; i <= dev->caps.num_ports; i++) { if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) dev->caps.gid_table_len[i] = mlx4_get_slave_num_gids(dev, 0); else dev->caps.gid_table_len[i] = 1; dev->caps.pkey_table_len[i] = dev->phys_caps.pkey_phys_table_len[i] - 1; } } static int choose_log_fs_mgm_entry_size(int qp_per_entry) { int i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE; i++) { if (qp_per_entry <= 4 * ((1 << i) / 16 - 2)) break; } return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1; } static void choose_steering_mode(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int nvfs; mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(dev->pdev), 0, &nvfs); if (high_rate_steer && !mlx4_is_mfunc(dev)) { dev->caps.flags &= ~(MLX4_DEV_CAP_FLAG_VEP_MC_STEER | MLX4_DEV_CAP_FLAG_VEP_UC_STEER); dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_FS_EN; } if (mlx4_log_num_mgm_entry_size == -1 && dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && (!mlx4_is_mfunc(dev) || (dev_cap->fs_max_num_qp_per_entry >= (nvfs + 1))) && choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= MLX4_MIN_MGM_LOG_ENTRY_SIZE) { dev->oper_log_mgm_entry_size = choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry); dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; } else { if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER && dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) dev->caps.steering_mode = MLX4_STEERING_MODE_B0; else { dev->caps.steering_mode = MLX4_STEERING_MODE_A0; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER || dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags " "set to use B0 steering. Falling back to A0 steering mode.\n"); } dev->oper_log_mgm_entry_size = mlx4_log_num_mgm_entry_size > 0 ? 
mlx4_log_num_mgm_entry_size : MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); } mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, " "log_num_mgm_entry_size = %d\n", mlx4_steering_mode_str(dev->caps.steering_mode), dev->oper_log_mgm_entry_size, mlx4_log_num_mgm_entry_size); } static int mlx4_init_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_dev_cap *dev_cap = NULL; struct mlx4_adapter adapter; struct mlx4_mod_stat_cfg mlx4_cfg; struct mlx4_profile profile; struct mlx4_init_hca_param init_hca; u64 icm_size; int err; if (!mlx4_is_slave(dev)) { err = mlx4_QUERY_FW(dev); if (err) { if (err == -EACCES) mlx4_info(dev, "non-primary physical function, skipping.\n"); else mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); return err; } err = mlx4_load_fw(dev); if (err) { mlx4_err(dev, "Failed to start FW, aborting.\n"); return err; } mlx4_cfg.log_pg_sz_m = 1; mlx4_cfg.log_pg_sz = 0; err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); if (err) mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); dev_cap = kzalloc(sizeof *dev_cap, GFP_KERNEL); if (!dev_cap) { mlx4_err(dev, "Failed to allocate memory for dev_cap\n"); err = -ENOMEM; goto err_stop_fw; } err = mlx4_dev_cap(dev, dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); goto err_stop_fw; } choose_steering_mode(dev, dev_cap); if (mlx4_is_master(dev)) mlx4_parav_master_pf_caps(dev); process_mod_param_profile(&profile); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) profile.num_mcg = MLX4_FS_NUM_MCG; icm_size = mlx4_make_profile(dev, &profile, dev_cap, &init_hca); if ((long long) icm_size < 0) { err = icm_size; goto err_stop_fw; } dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; init_hca.log_uar_sz = ilog2(dev->caps.num_uars); init_hca.uar_page_sz = PAGE_SHIFT - 12; err = mlx4_init_icm(dev, dev_cap, &init_hca, icm_size); if (err) goto err_stop_fw; init_hca.mw_enable = 1; err = mlx4_INIT_HCA(dev, &init_hca); if (err) { mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); goto err_free_icm; } /* * Read HCA frequency by QUERY_HCA command */ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) { memset(&init_hca, 0, sizeof(init_hca)); err = mlx4_QUERY_HCA(dev, &init_hca); if (err) { mlx4_err(dev, "QUERY_HCA command failed, disable timestamp.\n"); dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; } else { dev->caps.hca_core_clock = init_hca.hca_core_clock; } /* In case we got HCA frequency 0 - disable timestamping * to avoid dividing by zero */ if (!dev->caps.hca_core_clock) { dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_err(dev, "HCA frequency is 0. Timestamping is not supported."); } else if (map_internal_clock(dev)) { /* Map internal clock, * in case of failure disable timestamping */ dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_err(dev, "Failed to map internal clock. 
Timestamping is not supported.\n"); } } } else { err = mlx4_init_slave(dev); if (err) { mlx4_err(dev, "Failed to initialize slave\n"); return err; } err = mlx4_slave_cap(dev); if (err) { mlx4_err(dev, "Failed to obtain slave caps\n"); goto err_close; } } if (map_bf_area(dev)) mlx4_dbg(dev, "Failed to map blue flame area\n"); /* Only the master set the ports, all the rest got it from it.*/ if (!mlx4_is_slave(dev)) mlx4_set_port_mask(dev); err = mlx4_QUERY_ADAPTER(dev, &adapter); if (err) { mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n"); goto unmap_bf; } priv->eq_table.inta_pin = adapter.inta_pin; memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); memcpy(dev->vsd, adapter.vsd, sizeof(dev->vsd)); dev->vsd_vendor_id = adapter.vsd_vendor_id; if (!mlx4_is_slave(dev)) kfree(dev_cap); return 0; unmap_bf: if (!mlx4_is_slave(dev)) unmap_internal_clock(dev); unmap_bf_area(dev); if (mlx4_is_slave(dev)) { kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); } err_close: if (mlx4_is_slave(dev)) mlx4_slave_exit(dev); else mlx4_CLOSE_HCA(dev, 0); err_free_icm: if (!mlx4_is_slave(dev)) mlx4_free_icms(dev); err_stop_fw: if (!mlx4_is_slave(dev)) { if (!mlx4_UNMAP_FA(dev)) mlx4_free_icm(dev, priv->fw.fw_icm, 0); else pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); kfree(dev_cap); } return err; } static int mlx4_init_counters_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int nent_pow2, port_indx, vf_index, num_counters; int res, index = 0; struct counter_index *new_counter_index; if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) return -ENOENT; if (!mlx4_is_slave(dev) && dev->caps.max_counters == dev->caps.max_extended_counters) { res = mlx4_cmd(dev, MLX4_IF_STATE_EXTENDED, 0, 0, MLX4_CMD_SET_IF_STAT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (res) { mlx4_err(dev, "Failed to set extended counters (err=%d)\n", res); return res; } } mutex_init(&priv->counters_table.mutex); if (mlx4_is_slave(dev)) { for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]); if (dev->caps.def_counter_index[port_indx] != 0xFF) { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) return -ENOMEM; new_counter_index->index = dev->caps.def_counter_index[port_indx]; list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port_indx]); } } mlx4_dbg(dev, "%s: slave allocated %d counters for %d ports\n", __func__, dev->caps.num_ports, dev->caps.num_ports); return 0; } nent_pow2 = roundup_pow_of_two(dev->caps.max_counters); for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]); /* allocating 2 counters per port for PFs */ /* For the PF, the ETH default counters are 0,2; */ /* and the RoCE default counters are 1,3 */ for (num_counters = 0; num_counters < 2; num_counters++, index++) { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) return -ENOMEM; new_counter_index->index = index; list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port_indx]); } } if (mlx4_is_master(dev)) { for (vf_index = 0; vf_index < dev->num_vfs; vf_index++) { for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { INIT_LIST_HEAD(&priv->counters_table.vf_list[vf_index][port_indx]); new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if 
(!new_counter_index) return -ENOMEM; if (index < nent_pow2 - 2) { new_counter_index->index = index; index++; } else { new_counter_index->index = MLX4_SINK_COUNTER_INDEX; } list_add_tail(&new_counter_index->list, &priv->counters_table.vf_list[vf_index][port_indx]); } } res = mlx4_bitmap_init(&priv->counters_table.bitmap, nent_pow2, nent_pow2 - 1, index, 1); mlx4_dbg(dev, "%s: master allocated %d counters for %d VFs\n", __func__, index, dev->num_vfs); } else { res = mlx4_bitmap_init(&priv->counters_table.bitmap, nent_pow2, nent_pow2 - 1, index, 1); mlx4_dbg(dev, "%s: native allocated %d counters for %d ports\n", __func__, index, dev->caps.num_ports); } return 0; } static void mlx4_cleanup_counters_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int i, j; struct counter_index *port, *tmp_port; struct counter_index *vf, *tmp_vf; mutex_lock(&priv->counters_table.mutex); if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) { for (i = 0; i < dev->caps.num_ports; i++) { list_for_each_entry_safe(port, tmp_port, &priv->counters_table.global_port_list[i], list) { list_del(&port->list); kfree(port); } } if (!mlx4_is_slave(dev)) { for (i = 0; i < dev->num_vfs; i++) { for (j = 0; j < dev->caps.num_ports; j++) { list_for_each_entry_safe(vf, tmp_vf, &priv->counters_table.vf_list[i][j], list) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, vf->index)) mlx4_dbg(dev, "%s: reset counter %d failed\n", __func__, vf->index); list_del(&vf->list); kfree(vf); } } } mlx4_bitmap_cleanup(&priv->counters_table.bitmap); } } mutex_unlock(&priv->counters_table.mutex); } int __mlx4_slave_counters_free(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = mlx4_priv(dev); int i, first; struct counter_index *vf, *tmp_vf; /* clean the VF's counters for the next usage */ if (slave > 0 && slave <= dev->num_vfs) { mlx4_dbg(dev, "%s: free counters of slave(%d)\n" , __func__, slave); mutex_lock(&priv->counters_table.mutex); for (i = 0; i < dev->caps.num_ports; i++) { first = 0; list_for_each_entry_safe(vf, tmp_vf, &priv->counters_table.vf_list[slave - 1][i], list) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, vf->index)) mlx4_dbg(dev, "%s: reset counter %d failed\n", __func__, vf->index); if (first++ && vf->index != MLX4_SINK_COUNTER_INDEX) { mlx4_dbg(dev, "%s: delete counter index %d for slave %d and port %d\n" , __func__, vf->index, slave, i + 1); mlx4_bitmap_free(&priv->counters_table.bitmap, vf->index, MLX4_USE_RR); list_del(&vf->list); kfree(vf); } else { mlx4_dbg(dev, "%s: can't delete default counter index %d for slave %d and port %d\n" , __func__, vf->index, slave, i + 1); } } } mutex_unlock(&priv->counters_table.mutex); } return 0; } int __mlx4_counter_alloc(struct mlx4_dev *dev, int slave, int port, u32 *idx) { struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *new_counter_index; if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) return -ENOENT; if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) || (port < 0) || (port > MLX4_MAX_PORTS)) { mlx4_dbg(dev, "%s: invalid slave(%d) or port(%d) index\n", __func__, slave, port); return -EINVAL; } /* old guests cannot request a counter by port index; for port 0, hand back the sink counter */ if (port == 0) { *idx = MLX4_SINK_COUNTER_INDEX; mlx4_dbg(dev, "%s: allocated default counter index %d for slave %d port %d\n" , __func__, *idx, slave, port); return 0; } mutex_lock(&priv->counters_table.mutex); *idx = mlx4_bitmap_alloc(&priv->counters_table.bitmap); /* if no resources return the default counter of the slave and port */ if (*idx == -1) { 
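/* bitmap exhausted: rather than fail, reuse the default counter that mlx4_init_counters_table placed first on this slave/port list */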
if (slave == 0) { /* PF/native: the first list entry is the port's default Ethernet counter */ new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next, struct counter_index, list); } else { new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next, struct counter_index, list); } *idx = new_counter_index->index; mlx4_dbg(dev, "%s: allocated default counter index %d for slave %d port %d\n" , __func__, *idx, slave, port); goto out; } if (slave == 0) { /* native or master */ new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) goto no_mem; new_counter_index->index = *idx; list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]); } else { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) goto no_mem; new_counter_index->index = *idx; list_add_tail(&new_counter_index->list, &priv->counters_table.vf_list[slave - 1][port - 1]); } mlx4_dbg(dev, "%s: allocated counter index %d for slave %d port %d\n" , __func__, *idx, slave, port); out: mutex_unlock(&priv->counters_table.mutex); return 0; no_mem: mlx4_bitmap_free(&priv->counters_table.bitmap, *idx, MLX4_USE_RR); mutex_unlock(&priv->counters_table.mutex); *idx = MLX4_SINK_COUNTER_INDEX; mlx4_dbg(dev, "%s: failed err (%d)\n" , __func__, -ENOMEM); return -ENOMEM; } int mlx4_counter_alloc(struct mlx4_dev *dev, u8 port, u32 *idx) { u64 out_param; int err; struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *new_counter_index, *c_index; if (mlx4_is_mfunc(dev)) { err = mlx4_cmd_imm(dev, 0, &out_param, ((u32) port) << 8 | (u32) RES_COUNTER, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) { *idx = get_param_l(&out_param); if (*idx == MLX4_SINK_COUNTER_INDEX) return -ENOSPC; mutex_lock(&priv->counters_table.mutex); c_index = list_entry(priv->counters_table.global_port_list[port - 1].next, struct counter_index, list); mutex_unlock(&priv->counters_table.mutex); if (c_index->index == *idx) return -EEXIST; if (mlx4_is_slave(dev)) { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) { mlx4_counter_free(dev, port, *idx); return -ENOMEM; } new_counter_index->index = *idx; mutex_lock(&priv->counters_table.mutex); list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]); mutex_unlock(&priv->counters_table.mutex); mlx4_dbg(dev, "%s: allocated counter index %d for port %d\n" , __func__, *idx, port); } } return err; } return __mlx4_counter_alloc(dev, 0, port, idx); } EXPORT_SYMBOL_GPL(mlx4_counter_alloc); void __mlx4_counter_free(struct mlx4_dev *dev, int slave, int port, u32 idx) { /* check whether native or slave and delete accordingly */ struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *pf, *tmp_pf; struct counter_index *vf, *tmp_vf; int first; if (idx == MLX4_SINK_COUNTER_INDEX) { mlx4_dbg(dev, "%s: try to delete default counter index %d for port %d\n" , __func__, idx, port); return; } if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) || (port < 0) || (port > MLX4_MAX_PORTS)) { mlx4_warn(dev, "%s: deletion failed due to invalid slave(%d) or port(%d) index\n" , __func__, slave, port); return; } mutex_lock(&priv->counters_table.mutex); if (slave == 0) { first = 0; list_for_each_entry_safe(pf, tmp_pf, &priv->counters_table.global_port_list[port - 1], list) { /* the first 2 counters are reserved */ if (pf->index == idx) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, pf->index)) mlx4_dbg(dev, 
"%s: reset counter %d failed\n", __func__, pf->index); if (1 < first && idx != MLX4_SINK_COUNTER_INDEX) { list_del(&pf->list); kfree(pf); mlx4_dbg(dev, "%s: delete counter index %d for native device (%d) port %d\n" , __func__, idx, slave, port); mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR); goto out; } else { mlx4_dbg(dev, "%s: can't delete default counter index %d for native device (%d) port %d\n" , __func__, idx, slave, port); goto out; } } first++; } mlx4_dbg(dev, "%s: can't delete counter index %d for native device (%d) port %d\n" , __func__, idx, slave, port); } else { first = 0; list_for_each_entry_safe(vf, tmp_vf, &priv->counters_table.vf_list[slave - 1][port - 1], list) { /* the first element is reserved */ if (vf->index == idx) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, vf->index)) mlx4_dbg(dev, "%s: reset counter %d failed\n", __func__, vf->index); if (first) { list_del(&vf->list); kfree(vf); mlx4_dbg(dev, "%s: delete counter index %d for slave %d port %d\n", __func__, idx, slave, port); mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR); goto out; } else { mlx4_dbg(dev, "%s: can't delete default slave (%d) counter index %d for port %d\n" , __func__, slave, idx, port); goto out; } } first++; } mlx4_dbg(dev, "%s: can't delete slave (%d) counter index %d for port %d\n" , __func__, slave, idx, port); } out: mutex_unlock(&priv->counters_table.mutex); } void mlx4_counter_free(struct mlx4_dev *dev, u8 port, u32 idx) { u64 in_param = 0; struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *counter, *tmp_counter; int first = 0; if (mlx4_is_mfunc(dev)) { set_param_l(&in_param, idx); mlx4_cmd(dev, in_param, ((u32) port) << 8 | (u32) RES_COUNTER, RES_OP_RESERVE, MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (mlx4_is_slave(dev) && idx != MLX4_SINK_COUNTER_INDEX) { mutex_lock(&priv->counters_table.mutex); list_for_each_entry_safe(counter, tmp_counter, &priv->counters_table.global_port_list[port - 1], list) { if (counter->index == idx && first++) { list_del(&counter->list); kfree(counter); mlx4_dbg(dev, "%s: delete counter index %d for port %d\n" , __func__, idx, port); mutex_unlock(&priv->counters_table.mutex); return; } } mutex_unlock(&priv->counters_table.mutex); } return; } __mlx4_counter_free(dev, 0, port, idx); } EXPORT_SYMBOL_GPL(mlx4_counter_free); int __mlx4_clear_if_stat(struct mlx4_dev *dev, u8 counter_index) { struct mlx4_cmd_mailbox *if_stat_mailbox = NULL; int err = 0; u32 if_stat_in_mod = (counter_index & 0xff) | (1 << 31); if (counter_index == MLX4_SINK_COUNTER_INDEX) return -EINVAL; if (mlx4_is_slave(dev)) return 0; if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(if_stat_mailbox)) { err = PTR_ERR(if_stat_mailbox); return err; } err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0, MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, if_stat_mailbox); return err; } u8 mlx4_get_default_counter_index(struct mlx4_dev *dev, int slave, int port) { struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *new_counter_index; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) { mlx4_dbg(dev, "%s: return counter index %d for slave %d port (MLX4_PORT_TYPE_IB) %d\n", __func__, MLX4_SINK_COUNTER_INDEX, slave, port); return (u8)MLX4_SINK_COUNTER_INDEX; } mutex_lock(&priv->counters_table.mutex); if (slave == 0) { new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next, struct counter_index, list); } else { 
new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next, struct counter_index, list); } mutex_unlock(&priv->counters_table.mutex); mlx4_dbg(dev, "%s: return counter index %d for slave %d port %d\n", __func__, new_counter_index->index, slave, port); return (u8)new_counter_index->index; } int mlx4_get_vport_ethtool_stats(struct mlx4_dev *dev, int port, struct mlx4_en_vport_stats *vport_stats, int reset) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *if_stat_mailbox = NULL; union mlx4_counter *counter; int err = 0; u32 if_stat_in_mod; struct counter_index *vport, *tmp_vport; if (!vport_stats) return -EINVAL; if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(if_stat_mailbox)) { err = PTR_ERR(if_stat_mailbox); return err; } mutex_lock(&priv->counters_table.mutex); list_for_each_entry_safe(vport, tmp_vport, &priv->counters_table.global_port_list[port - 1], list) { if (vport->index == MLX4_SINK_COUNTER_INDEX) continue; memset(if_stat_mailbox->buf, 0, sizeof(union mlx4_counter)); if_stat_in_mod = (vport->index & 0xff) | ((reset & 1) << 31); err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0, MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (err) { mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n", __func__, vport->index); goto if_stat_out; } counter = (union mlx4_counter *)if_stat_mailbox->buf; if ((counter->control.cnt_mode & 0xf) == 1) { vport_stats->rx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastFrames); vport_stats->rx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxUnicastFrames); vport_stats->rx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxMulticastFrames); vport_stats->tx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastFrames); vport_stats->tx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxUnicastFrames); vport_stats->tx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxMulticastFrames); vport_stats->rx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastOctets); vport_stats->rx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxUnicastOctets); vport_stats->rx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxMulticastOctets); vport_stats->tx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastOctets); vport_stats->tx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxUnicastOctets); vport_stats->tx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxMulticastOctets); vport_stats->rx_errors += be64_to_cpu(counter->ext.counters[0].IfRxErrorFrames); vport_stats->rx_dropped += be64_to_cpu(counter->ext.counters[0].IfRxNoBufferFrames); vport_stats->tx_errors += be64_to_cpu(counter->ext.counters[0].IfTxDroppedFrames); } } if_stat_out: mutex_unlock(&priv->counters_table.mutex); mlx4_free_cmd_mailbox(dev, if_stat_mailbox); return err; } EXPORT_SYMBOL_GPL(mlx4_get_vport_ethtool_stats); static int mlx4_setup_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; int port; __be32 ib_port_default_caps; err = mlx4_init_uar_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "user access region table (err=%d), aborting.\n", err); return err; } err = mlx4_uar_alloc(dev, &priv->driver_uar); if (err) { mlx4_err(dev, "Failed to allocate driver access region " "(err=%d), aborting.\n", err); goto err_uar_table_free; } priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if 
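/* priv->kar is the kernel mapping of the driver UAR page; setup cannot continue without it */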
(!priv->kar) { mlx4_err(dev, "Couldn't map kernel access region, " "aborting.\n"); err = -ENOMEM; goto err_uar_free; } err = mlx4_init_pd_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "protection domain table (err=%d), aborting.\n", err); goto err_kar_unmap; } err = mlx4_init_xrcd_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "reliable connection domain table (err=%d), " "aborting.\n", err); goto err_pd_table_free; } err = mlx4_init_mr_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "memory region table (err=%d), aborting.\n", err); goto err_xrcd_table_free; } if (!mlx4_is_slave(dev)) { err = mlx4_init_mcg_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "multicast group table (err=%d), aborting.\n", err); goto err_mr_table_free; } } err = mlx4_init_eq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "event queue table (err=%d), aborting.\n", err); goto err_mcg_table_free; } err = mlx4_cmd_use_events(dev); if (err) { mlx4_err(dev, "Failed to switch to event-driven " "firmware commands (err=%d), aborting.\n", err); goto err_eq_table_free; } err = mlx4_NOP(dev); if (err) { if (dev->flags & MLX4_FLAG_MSI_X) { mlx4_warn(dev, "NOP command failed to generate MSI-X " "interrupt (IRQ %d).\n", priv->eq_table.eq[dev->caps.num_comp_vectors].irq); mlx4_warn(dev, "Trying again without MSI-X.\n"); } else { mlx4_err(dev, "NOP command failed to generate interrupt " "(IRQ %d), aborting.\n", priv->eq_table.eq[dev->caps.num_comp_vectors].irq); mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n"); } goto err_cmd_poll; } mlx4_dbg(dev, "NOP command IRQ test passed\n"); err = mlx4_init_cq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "completion queue table (err=%d), aborting.\n", err); goto err_cmd_poll; } err = mlx4_init_srq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "shared receive queue table (err=%d), aborting.\n", err); goto err_cq_table_free; } err = mlx4_init_qp_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "queue pair table (err=%d), aborting.\n", err); goto err_srq_table_free; } err = mlx4_init_counters_table(dev); if (err && err != -ENOENT) { mlx4_err(dev, "Failed to initialize counters table (err=%d), " "aborting.\n", err); goto err_qp_table_free; } if (!mlx4_is_slave(dev)) { for (port = 1; port <= dev->caps.num_ports; port++) { ib_port_default_caps = 0; err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); if (err) mlx4_warn(dev, "failed to get port %d default " "ib capabilities (%d). Continuing " "with caps = 0\n", port, err); dev->caps.ib_port_def_cap[port] = ib_port_default_caps; /* initialize per-slave default ib port capabilities */ if (mlx4_is_master(dev)) { int i; for (i = 0; i < dev->num_slaves; i++) { if (i == mlx4_master_func_num(dev)) continue; priv->mfunc.master.slave_state[i].ib_cap_mask[port] = ib_port_default_caps; } } dev->caps.port_ib_mtu[port] = IB_MTU_4096; err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ? 
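/* only the master passes a real P_Key table size; -1 leaves the firmware value unchanged */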
dev->caps.pkey_table_len[port] : -1); if (err) { mlx4_err(dev, "Failed to set port %d (err=%d), " "aborting\n", port, err); goto err_counters_table_free; } } } return 0; err_counters_table_free: mlx4_cleanup_counters_table(dev); err_qp_table_free: mlx4_cleanup_qp_table(dev); err_srq_table_free: mlx4_cleanup_srq_table(dev); err_cq_table_free: mlx4_cleanup_cq_table(dev); err_cmd_poll: mlx4_cmd_use_polling(dev); err_eq_table_free: mlx4_cleanup_eq_table(dev); err_mcg_table_free: if (!mlx4_is_slave(dev)) mlx4_cleanup_mcg_table(dev); err_mr_table_free: mlx4_cleanup_mr_table(dev); err_xrcd_table_free: mlx4_cleanup_xrcd_table(dev); err_pd_table_free: mlx4_cleanup_pd_table(dev); err_kar_unmap: iounmap(priv->kar); err_uar_free: mlx4_uar_free(dev, &priv->driver_uar); err_uar_table_free: mlx4_cleanup_uar_table(dev); return err; } static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct msix_entry *entries; int nreq = min_t(int, dev->caps.num_ports * min_t(int, num_possible_cpus() + 1, MAX_MSIX_P_PORT) + MSIX_LEGACY_SZ, MAX_MSIX); int err; int i; if (msi_x) { nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, nreq); if (msi_x > 1 && !mlx4_is_mfunc(dev)) nreq = min_t(int, nreq, msi_x); entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) goto no_msi; for (i = 0; i < nreq; ++i) entries[i].entry = i; retry: err = pci_enable_msix(dev->pdev, entries, nreq); if (err) { /* Try again if at least 2 vectors are available */ if (err > 1) { mlx4_info(dev, "Requested %d vectors, " "but only %d MSI-X vectors available, " "trying again\n", nreq, err); nreq = err; goto retry; } kfree(entries); /* if error, or can't alloc even 1 IRQ */ if (err < 0) { mlx4_err(dev, "No IRQs left, device can't " "be started.\n"); goto no_irq; } goto no_msi; } if (nreq < MSIX_LEGACY_SZ + dev->caps.num_ports * MIN_MSIX_P_PORT) { /*Working in legacy mode , all EQ's shared*/ dev->caps.comp_pool = 0; dev->caps.num_comp_vectors = nreq - 1; } else { dev->caps.comp_pool = nreq - MSIX_LEGACY_SZ; dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1; } for (i = 0; i < nreq; ++i) priv->eq_table.eq[i].irq = entries[i].vector; dev->flags |= MLX4_FLAG_MSI_X; kfree(entries); return; } no_msi: dev->caps.num_comp_vectors = 1; dev->caps.comp_pool = 0; for (i = 0; i < 2; ++i) priv->eq_table.eq[i].irq = dev->pdev->irq; return; no_irq: dev->caps.num_comp_vectors = 0; dev->caps.comp_pool = 0; return; } static void mlx4_init_hca_info(struct mlx4_dev *dev) { struct mlx4_hca_info *info = &mlx4_priv(dev)->hca_info; info->dev = dev; info->firmware_attr = (struct device_attribute)__ATTR(fw_ver, S_IRUGO, show_firmware_version, NULL); if (device_create_file(&dev->pdev->dev, &info->firmware_attr)) mlx4_err(dev, "Failed to add file firmware version"); info->hca_attr = (struct device_attribute)__ATTR(hca, S_IRUGO, show_hca, NULL); if (device_create_file(&dev->pdev->dev, &info->hca_attr)) mlx4_err(dev, "Failed to add file hca type"); info->board_attr = (struct device_attribute)__ATTR(board_id, S_IRUGO, show_board, NULL); if (device_create_file(&dev->pdev->dev, &info->board_attr)) mlx4_err(dev, "Failed to add file board id type"); } static int mlx4_init_port_info(struct mlx4_dev *dev, int port) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; int err = 0; info->dev = dev; info->port = port; if (!mlx4_is_slave(dev)) { mlx4_init_mac_table(dev, &info->mac_table); mlx4_init_vlan_table(dev, &info->vlan_table); info->base_qpn = mlx4_get_base_qpn(dev, port); } sprintf(info->dev_name, "mlx4_port%d", 
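/* per-port sysfs attribute name, e.g. "mlx4_port1" */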
port); info->port_attr.attr.name = info->dev_name; if (mlx4_is_mfunc(dev)) info->port_attr.attr.mode = S_IRUGO; else { info->port_attr.attr.mode = S_IRUGO | S_IWUSR; info->port_attr.store = set_port_type; } info->port_attr.show = show_port_type; sysfs_attr_init(&info->port_attr.attr); err = device_create_file(&dev->pdev->dev, &info->port_attr); if (err) { mlx4_err(dev, "Failed to create file for port %d\n", port); info->port = -1; } sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port); info->port_mtu_attr.attr.name = info->dev_mtu_name; if (mlx4_is_mfunc(dev)) info->port_mtu_attr.attr.mode = S_IRUGO; else { info->port_mtu_attr.attr.mode = S_IRUGO | S_IWUSR; info->port_mtu_attr.store = set_port_ib_mtu; } info->port_mtu_attr.show = show_port_ib_mtu; sysfs_attr_init(&info->port_mtu_attr.attr); err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr); if (err) { mlx4_err(dev, "Failed to create mtu file for port %d\n", port); device_remove_file(&info->dev->pdev->dev, &info->port_attr); info->port = -1; } return err; } static void mlx4_cleanup_hca_info(struct mlx4_hca_info *info) { device_remove_file(&info->dev->pdev->dev, &info->firmware_attr); device_remove_file(&info->dev->pdev->dev, &info->board_attr); device_remove_file(&info->dev->pdev->dev, &info->hca_attr); } static void mlx4_cleanup_port_info(struct mlx4_port_info *info) { if (info->port < 0) return; device_remove_file(&info->dev->pdev->dev, &info->port_attr); device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr); } static int mlx4_init_steering(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int num_entries = dev->caps.num_ports; int i, j; priv->steer = kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL); if (!priv->steer) return -ENOMEM; for (i = 0; i < num_entries; i++) for (j = 0; j < MLX4_NUM_STEERS; j++) { INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]); INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]); } return 0; } static void mlx4_clear_steering(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_steer_index *entry, *tmp_entry; struct mlx4_promisc_qp *pqp, *tmp_pqp; int num_entries = dev->caps.num_ports; int i, j; for (i = 0; i < num_entries; i++) { for (j = 0; j < MLX4_NUM_STEERS; j++) { list_for_each_entry_safe(pqp, tmp_pqp, &priv->steer[i].promisc_qps[j], list) { list_del(&pqp->list); kfree(pqp); } list_for_each_entry_safe(entry, tmp_entry, &priv->steer[i].steer_entries[j], list) { list_del(&entry->list); list_for_each_entry_safe(pqp, tmp_pqp, &entry->duplicates, list) { list_del(&pqp->list); kfree(pqp); } kfree(entry); } } } kfree(priv->steer); } static int extended_func_num(struct pci_dev *pdev) { return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn); } #define MLX4_OWNER_BASE 0x8069c #define MLX4_OWNER_SIZE 4 static int mlx4_get_ownership(struct mlx4_dev *dev) { void __iomem *owner; u32 ret; if (pci_channel_offline(dev->pdev)) return -EIO; owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); return -ENOMEM; } ret = readl(owner); iounmap(owner); return (int) !!ret; } static void mlx4_free_ownership(struct mlx4_dev *dev) { void __iomem *owner; if (pci_channel_offline(dev->pdev)) return; owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); return; } writel(0, owner); msleep(1000); iounmap(owner); } static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) 
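/* common bring-up path: used by probe (mlx4_init_one), resume, PCI slot reset, and mlx4_restart_one */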
{ struct mlx4_priv *priv; struct mlx4_dev *dev; int err; int port; int nvfs, prb_vf; pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); err = pci_enable_device(pdev); if (err) { dev_err(&pdev->dev, "Cannot enable PCI device, " "aborting.\n"); return err; } mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(pdev), 0, &nvfs); mlx4_get_val(probe_vf.dbdf2val.tbl, pci_physfn(pdev), 0, &prb_vf); if (nvfs > MLX4_MAX_NUM_VF) { dev_err(&pdev->dev, "There are more VF's (%d) than allowed(%d)\n", nvfs, MLX4_MAX_NUM_VF); return -EINVAL; } if (nvfs < 0) { dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); return -EINVAL; } /* * Check for BARs. */ if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { dev_err(&pdev->dev, "Missing DCS, aborting." "(driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%x)\n", pci_dev_data, pci_resource_flags(pdev, 0)); err = -ENODEV; goto err_disable_pdev; } if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { dev_err(&pdev->dev, "Missing UAR, aborting.\n"); err = -ENODEV; goto err_disable_pdev; } err = pci_request_regions(pdev, DRV_NAME); if (err) { dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); goto err_disable_pdev; } pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); goto err_release_regions; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " "consistent PCI DMA mask.\n"); err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); goto err_release_regions; } } /* Allow large DMA segments, up to the firmware limit of 1 GB */ dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); priv = kzalloc(sizeof *priv, GFP_KERNEL); if (!priv) { dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; goto err_release_regions; } dev = &priv->dev; dev->pdev = pdev; INIT_LIST_HEAD(&priv->dev_list); INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); mutex_init(&priv->port_mutex); INIT_LIST_HEAD(&priv->pgdir_list); mutex_init(&priv->pgdir_mutex); INIT_LIST_HEAD(&priv->bf_list); mutex_init(&priv->bf_mutex); dev->rev_id = pdev->revision; dev->numa_node = dev_to_node(&pdev->dev); /* Detect if this device is a virtual function */ if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { /* When acting as pf, we normally skip vfs unless explicitly * requested to probe them. */ if (nvfs && extended_func_num(pdev) > prb_vf) { mlx4_warn(dev, "Skipping virtual function:%d\n", extended_func_num(pdev)); err = -ENODEV; goto err_free_dev; } mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); dev->flags |= MLX4_FLAG_SLAVE; } else { /* We reset the device and enable SRIOV only for physical * devices. Try to claim ownership on the device; * if already taken, skip -- do not allow multiple PFs */ err = mlx4_get_ownership(dev); if (err) { if (err < 0) goto err_free_dev; else { mlx4_warn(dev, "Multiple PFs not yet supported." 
" Skipping PF.\n"); err = -EINVAL; goto err_free_dev; } } if (nvfs) { mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", nvfs); err = pci_enable_sriov(pdev, nvfs); if (err) { mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d).\n", err); err = 0; } else { mlx4_warn(dev, "Running in master mode\n"); dev->flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; dev->num_vfs = nvfs; } } atomic_set(&priv->opreq_count, 0); INIT_WORK(&priv->opreq_task, mlx4_opreq_action); /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left * the HCA in an undefined state. */ err = mlx4_reset(dev); if (err) { mlx4_err(dev, "Failed to reset HCA, aborting.\n"); goto err_sriov; } } slave_start: err = mlx4_cmd_init(dev); if (err) { mlx4_err(dev, "Failed to init command interface, aborting.\n"); goto err_sriov; } /* In slave functions, the communication channel must be initialized * before posting commands. Also, init num_slaves before calling * mlx4_init_hca */ if (mlx4_is_mfunc(dev)) { if (mlx4_is_master(dev)) dev->num_slaves = MLX4_MAX_NUM_SLAVES; else { dev->num_slaves = 0; err = mlx4_multi_func_init(dev); if (err) { mlx4_err(dev, "Failed to init slave mfunc" " interface, aborting.\n"); goto err_cmd; } } } err = mlx4_init_hca(dev); if (err) { if (err == -EACCES) { /* Not primary Physical function * Running in slave mode */ mlx4_cmd_cleanup(dev); dev->flags |= MLX4_FLAG_SLAVE; dev->flags &= ~MLX4_FLAG_MASTER; goto slave_start; } else goto err_mfunc; } /* In master functions, the communication channel must be initialized * after obtaining its address from fw */ if (mlx4_is_master(dev)) { err = mlx4_multi_func_init(dev); if (err) { mlx4_err(dev, "Failed to init master mfunc" "interface, aborting.\n"); goto err_close; } } err = mlx4_alloc_eq_table(dev); if (err) goto err_master_mfunc; priv->msix_ctl.pool_bm = 0; mutex_init(&priv->msix_ctl.pool_lock); mlx4_enable_msi_x(dev); /* no MSIX and no shared IRQ */ if (!dev->caps.num_comp_vectors && !dev->caps.comp_pool) { err = -ENOSPC; goto err_free_eq; } if ((mlx4_is_mfunc(dev)) && !(dev->flags & MLX4_FLAG_MSI_X)) { err = -ENOSYS; mlx4_err(dev, "INTx is not supported in multi-function mode." 
" aborting.\n"); goto err_free_eq; } if (!mlx4_is_slave(dev)) { err = mlx4_init_steering(dev); if (err) goto err_free_eq; } err = mlx4_setup_hca(dev); if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) && !mlx4_is_mfunc(dev)) { dev->flags &= ~MLX4_FLAG_MSI_X; dev->caps.num_comp_vectors = 1; dev->caps.comp_pool = 0; pci_disable_msix(pdev); err = mlx4_setup_hca(dev); } if (err) goto err_steer; mlx4_init_quotas(dev); mlx4_init_hca_info(dev); for (port = 1; port <= dev->caps.num_ports; port++) { err = mlx4_init_port_info(dev, port); if (err) goto err_port; } err = mlx4_register_device(dev); if (err) goto err_port; mlx4_request_modules(dev); mlx4_sense_init(dev); mlx4_start_sense(dev); priv->pci_dev_data = pci_dev_data; pci_set_drvdata(pdev, dev); return 0; err_port: for (--port; port >= 1; --port) mlx4_cleanup_port_info(&priv->port[port]); mlx4_cleanup_counters_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); mlx4_cleanup_cq_table(dev); mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mcg_table(dev); mlx4_cleanup_mr_table(dev); mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); mlx4_cleanup_uar_table(dev); err_steer: if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); err_free_eq: mlx4_free_eq_table(dev); err_master_mfunc: if (mlx4_is_master(dev)) { mlx4_free_resource_tracker(dev, RES_TR_FREE_STRUCTS_ONLY); mlx4_multi_func_cleanup(dev); } if (mlx4_is_slave(dev)) { kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); } err_close: if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); mlx4_close_hca(dev); err_mfunc: if (mlx4_is_slave(dev)) mlx4_multi_func_cleanup(dev); err_cmd: mlx4_cmd_cleanup(dev); err_sriov: if (dev->flags & MLX4_FLAG_SRIOV) pci_disable_sriov(pdev); if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); err_free_dev: kfree(priv); err_release_regions: pci_release_regions(pdev); err_disable_pdev: pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); return err; } static int __devinit mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { device_set_desc(pdev->dev.bsddev, mlx4_version); return __mlx4_init_one(pdev, id->driver_data); } static void mlx4_remove_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); int p; if (dev) { /* in SRIOV it is not allowed to unload the pf's * driver while there are alive vf's */ if (mlx4_is_master(dev)) { if (mlx4_how_many_lives_vf(dev)) mlx4_err(dev, "Removing PF when there are assigned VF's !!!\n"); } mlx4_stop_sense(dev); mlx4_unregister_device(dev); mlx4_cleanup_hca_info(&priv->hca_info); for (p = 1; p <= dev->caps.num_ports; p++) { mlx4_cleanup_port_info(&priv->port[p]); mlx4_CLOSE_PORT(dev, p); } if (mlx4_is_master(dev)) mlx4_free_resource_tracker(dev, RES_TR_FREE_SLAVES_ONLY); mlx4_cleanup_counters_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); mlx4_cleanup_cq_table(dev); mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mcg_table(dev); mlx4_cleanup_mr_table(dev); mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); if (mlx4_is_master(dev)) mlx4_free_resource_tracker(dev, RES_TR_FREE_STRUCTS_ONLY); iounmap(priv->kar); mlx4_uar_free(dev, &priv->driver_uar); mlx4_cleanup_uar_table(dev); if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); mlx4_free_eq_table(dev); if (mlx4_is_master(dev)) mlx4_multi_func_cleanup(dev); mlx4_close_hca(dev); if (mlx4_is_slave(dev)) mlx4_multi_func_cleanup(dev); mlx4_cmd_cleanup(dev); if 
(dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); if (dev->flags & MLX4_FLAG_SRIOV) { mlx4_warn(dev, "Disabling SR-IOV\n"); pci_disable_sriov(pdev); } if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); kfree(priv); pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } } static int restore_current_port_types(struct mlx4_dev *dev, enum mlx4_port_type *types, enum mlx4_port_type *poss_types) { struct mlx4_priv *priv = mlx4_priv(dev); int err, i; mlx4_stop_sense(dev); mutex_lock(&priv->port_mutex); for (i = 0; i < dev->caps.num_ports; i++) dev->caps.possible_type[i + 1] = poss_types[i]; err = mlx4_change_port_types(dev, types); mlx4_start_sense(dev); mutex_unlock(&priv->port_mutex); return err; } int mlx4_restart_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); enum mlx4_port_type curr_type[MLX4_MAX_PORTS]; enum mlx4_port_type poss_type[MLX4_MAX_PORTS]; int pci_dev_data, err, i; pci_dev_data = priv->pci_dev_data; for (i = 0; i < dev->caps.num_ports; i++) { curr_type[i] = dev->caps.port_type[i + 1]; poss_type[i] = dev->caps.possible_type[i + 1]; } mlx4_remove_one(pdev); err = __mlx4_init_one(pdev, pci_dev_data); if (err) return err; dev = pci_get_drvdata(pdev); err = restore_current_port_types(dev, curr_type, poss_type); if (err) mlx4_err(dev, "mlx4_restart_one: could not restore original port types (%d)\n", err); return 0; } static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = { /* MT25408 "Hermon" SDR */ { PCI_VDEVICE(MELLANOX, 0x6340), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" DDR */ { PCI_VDEVICE(MELLANOX, 0x634a), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" QDR */ { PCI_VDEVICE(MELLANOX, 0x6354), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" DDR PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x6732), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" QDR PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x673c), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" EN 10GigE */ { PCI_VDEVICE(MELLANOX, 0x6368), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x6750), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ { PCI_VDEVICE(MELLANOX, 0x6372), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ { PCI_VDEVICE(MELLANOX, 0x675a), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT26468 ConnectX EN 10GigE PCIe gen2*/ { PCI_VDEVICE(MELLANOX, 0x6764), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ { PCI_VDEVICE(MELLANOX, 0x6746), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT26478 ConnectX2 40GigE PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x676e), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25400 Family [ConnectX-2 Virtual Function] */ { PCI_VDEVICE(MELLANOX, 0x1002), MLX4_PCI_DEV_IS_VF }, /* MT27500 Family [ConnectX-3] */ { PCI_VDEVICE(MELLANOX, 0x1003), 0 }, /* MT27500 Family [ConnectX-3 Virtual Function] */ { PCI_VDEVICE(MELLANOX, 0x1004), MLX4_PCI_DEV_IS_VF }, { PCI_VDEVICE(MELLANOX, 0x1005), 0 }, /* MT27510 Family */ { PCI_VDEVICE(MELLANOX, 0x1006), 0 }, /* MT27511 Family */ { PCI_VDEVICE(MELLANOX, 0x1007), 0 }, /* MT27520 Family */ { PCI_VDEVICE(MELLANOX, 0x1008), 0 }, /* MT27521 Family */ { PCI_VDEVICE(MELLANOX, 0x1009), 0 }, /* MT27530 Family */ { PCI_VDEVICE(MELLANOX, 0x100a), 0 }, /* MT27531 Family */ { PCI_VDEVICE(MELLANOX, 0x100b), 0 }, /* MT27540 Family */ { 
PCI_VDEVICE(MELLANOX, 0x100c), 0 }, /* MT27541 Family */ { PCI_VDEVICE(MELLANOX, 0x100d), 0 }, /* MT27550 Family */ { PCI_VDEVICE(MELLANOX, 0x100e), 0 }, /* MT27551 Family */ { PCI_VDEVICE(MELLANOX, 0x100f), 0 }, /* MT27560 Family */ { PCI_VDEVICE(MELLANOX, 0x1010), 0 }, /* MT27561 Family */ { 0, } }; MODULE_DEVICE_TABLE(pci, mlx4_pci_table); static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { mlx4_remove_one(pdev); return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; } static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) { int ret = __mlx4_init_one(pdev, 0); return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } static const struct pci_error_handlers mlx4_err_handler = { .error_detected = mlx4_pci_err_detected, .slot_reset = mlx4_pci_slot_reset, }; static int suspend(struct pci_dev *pdev, pm_message_t state) { mlx4_remove_one(pdev); return 0; } static int resume(struct pci_dev *pdev) { return __mlx4_init_one(pdev, 0); } static struct pci_driver mlx4_driver = { .name = DRV_NAME, .id_table = mlx4_pci_table, .probe = mlx4_init_one, .remove = __devexit_p(mlx4_remove_one), .suspend = suspend, .resume = resume, .err_handler = &mlx4_err_handler, }; static int __init mlx4_verify_params(void) { int status; status = update_defaults(&port_type_array); if (status == INVALID_STR) { if (mlx4_fill_dbdf2val_tbl(&port_type_array.dbdf2val)) return -1; } else if (status == INVALID_DATA) { return -1; } status = update_defaults(&num_vfs); if (status == INVALID_STR) { if (mlx4_fill_dbdf2val_tbl(&num_vfs.dbdf2val)) return -1; } else if (status == INVALID_DATA) { return -1; } status = update_defaults(&probe_vf); if (status == INVALID_STR) { if (mlx4_fill_dbdf2val_tbl(&probe_vf.dbdf2val)) return -1; } else if (status == INVALID_DATA) { return -1; } if (msi_x < 0) { pr_warn("mlx4_core: bad msi_x: %d\n", msi_x); return -1; } if ((log_num_mac < 0) || (log_num_mac > 7)) { pr_warning("mlx4_core: bad num_mac: %d\n", log_num_mac); return -1; } if (log_num_vlan != 0) pr_warning("mlx4_core: log_num_vlan - obsolete module param, using %d\n", MLX4_LOG_NUM_VLANS); if (mlx4_set_4k_mtu != -1) pr_warning("mlx4_core: set_4k_mtu - obsolete module param\n"); if ((log_mtts_per_seg < 0) || (log_mtts_per_seg > 7)) { pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); return -1; } if (mlx4_log_num_mgm_entry_size != -1 && (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE || mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) { pr_warning("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not " "in legal range (-1 or %d..%d)\n", mlx4_log_num_mgm_entry_size, MLX4_MIN_MGM_LOG_ENTRY_SIZE, MLX4_MAX_MGM_LOG_ENTRY_SIZE); return -1; } if (mod_param_profile.num_qp < 18 || mod_param_profile.num_qp > 23) { pr_warning("mlx4_core: bad log_num_qp: %d\n", mod_param_profile.num_qp); return -1; } if (mod_param_profile.num_srq < 10) { pr_warning("mlx4_core: too low log_num_srq: %d\n", mod_param_profile.num_srq); return -1; } if (mod_param_profile.num_cq < 10) { pr_warning("mlx4_core: too low log_num_cq: %d\n", mod_param_profile.num_cq); return -1; } if (mod_param_profile.num_mpt < 10) { pr_warning("mlx4_core: too low log_num_mpt: %d\n", mod_param_profile.num_mpt); return -1; } if (mod_param_profile.num_mtt_segs && mod_param_profile.num_mtt_segs < 15) { pr_warning("mlx4_core: too low log_num_mtt: %d\n", mod_param_profile.num_mtt_segs); return -1; } if (mod_param_profile.num_mtt_segs > MLX4_MAX_LOG_NUM_MTT) 
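/* reject log_num_mtt values above the device-supported maximum */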
{ pr_warning("mlx4_core: too high log_num_mtt: %d\n", mod_param_profile.num_mtt_segs); return -1; } return 0; } static int __init mlx4_init(void) { int ret; if (mlx4_verify_params()) return -EINVAL; mlx4_catas_init(); mlx4_wq = create_singlethread_workqueue("mlx4"); if (!mlx4_wq) return -ENOMEM; if (enable_sys_tune) sys_tune_init(); ret = pci_register_driver(&mlx4_driver); if (ret < 0) goto err; return 0; err: if (enable_sys_tune) sys_tune_fini(); destroy_workqueue(mlx4_wq); return ret; } static void __exit mlx4_cleanup(void) { if (enable_sys_tune) sys_tune_fini(); pci_unregister_driver(&mlx4_driver); destroy_workqueue(mlx4_wq); } module_init_order(mlx4_init, SI_ORDER_MIDDLE); module_exit(mlx4_cleanup); static int mlx4_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlx4_mod = { .name = "mlx4", .evhand = mlx4_evhand, }; MODULE_VERSION(mlx4, 1); DECLARE_MODULE(mlx4, mlx4_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY); MODULE_DEPEND(mlx4, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/net/mlx4/port.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/port.c (revision 300675) +++ head/sys/ofed/drivers/net/mlx4/port.c (revision 300676) @@ -1,1222 +1,1224 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#define LINUXKPI_PARAM_PREFIX mlx4_ + #include #include #include #include #include #include #include "mlx4.h" #include "mlx4_stats.h" int mlx4_set_4k_mtu = -1; module_param_named(set_4k_mtu, mlx4_set_4k_mtu, int, 0444); MODULE_PARM_DESC(set_4k_mtu, "(Obsolete) attempt to set 4K MTU to all ConnectX ports"); #define MLX4_MAC_VALID (1ull << 63) #define MLX4_VLAN_VALID (1u << 31) #define MLX4_VLAN_MASK 0xfff void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) { int i; mutex_init(&table->mutex); for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { table->entries[i] = 0; table->refs[i] = 0; } table->max = 1 << dev->caps.log_num_macs; table->total = 0; } void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table) { int i; mutex_init(&table->mutex); for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { table->entries[i] = 0; table->refs[i] = 0; } table->max = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR; table->total = 0; } static int validate_index(struct mlx4_dev *dev, struct mlx4_mac_table *table, int index) { int err = 0; if (index < 0 || index >= table->max || !table->refs[index]) { mlx4_warn(dev, "No valid Mac entry for the given index\n"); err = -EINVAL; } return err; } static int find_index(struct mlx4_dev *dev, struct mlx4_mac_table *table, u64 mac) { int i; for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { if ((mac & MLX4_MAC_MASK) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) return i; } /* Mac not found */ return -EINVAL; } static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, __be64 *entries) { struct mlx4_cmd_mailbox *mailbox; u32 in_mod; int err; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE); in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; struct mlx4_mac_table *table = &info->mac_table; int i, err = 0; int free = -1; mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d\n", (unsigned long long) mac, port); mutex_lock(&table->mutex); for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { if (free < 0 && !table->refs[i]) { free = i; continue; } if ((mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) && table->refs[i]) { /* MAC already registered, Must not have duplicates */ err = i; ++table->refs[i]; goto out; } } mlx4_dbg(dev, "Free MAC index is %d\n", free); if (table->total == table->max) { /* No free mac entries */ err = -ENOSPC; goto out; } /* Register new MAC */ table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); err = mlx4_set_port_mac_table(dev, port, table->entries); if (unlikely(err)) { mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac); table->entries[free] = 0; goto out; } table->refs[free] = 1; err = free; ++table->total; out: mutex_unlock(&table->mutex); return err; } EXPORT_SYMBOL_GPL(__mlx4_register_mac); int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) { u64 out_param = 0; int err = -EINVAL; if (mlx4_is_mfunc(dev)) { if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) { err = mlx4_cmd_imm(dev, mac, &out_param, ((u32) port) << 8 | (u32) RES_MAC, RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } if (err && err == -EINVAL && mlx4_is_slave(dev)) { /* retry using old REG_MAC format */ set_param_l(&out_param, port); 
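/* legacy REG_MAC format: the port travels in out_param instead of the in_modifier */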
err = mlx4_cmd_imm(dev, mac, &out_param, RES_MAC, RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) dev->flags |= MLX4_FLAG_OLD_REG_MAC; } if (err) return err; return get_param_l(&out_param); } return __mlx4_register_mac(dev, port, mac); } EXPORT_SYMBOL_GPL(mlx4_register_mac); int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port) { return dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + (port - 1) * (1 << dev->caps.log_num_macs); } EXPORT_SYMBOL_GPL(mlx4_get_base_qpn); void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) { struct mlx4_port_info *info; struct mlx4_mac_table *table; int index; if (port < 1 || port > dev->caps.num_ports) { mlx4_warn(dev, "invalid port number (%d), aborting...\n", port); return; } info = &mlx4_priv(dev)->port[port]; table = &info->mac_table; mutex_lock(&table->mutex); index = find_index(dev, table, mac); if (validate_index(dev, table, index)) goto out; if (--table->refs[index]) { mlx4_dbg(dev, "Have more references for index %d," "no need to modify mac table\n", index); goto out; } table->entries[index] = 0; mlx4_set_port_mac_table(dev, port, table->entries); --table->total; out: mutex_unlock(&table->mutex); } EXPORT_SYMBOL_GPL(__mlx4_unregister_mac); void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) { u64 out_param = 0; if (mlx4_is_mfunc(dev)) { if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) { (void) mlx4_cmd_imm(dev, mac, &out_param, ((u32) port) << 8 | (u32) RES_MAC, RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } else { /* use old unregister mac format */ set_param_l(&out_param, port); (void) mlx4_cmd_imm(dev, mac, &out_param, RES_MAC, RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } return; } __mlx4_unregister_mac(dev, port, mac); return; } EXPORT_SYMBOL_GPL(mlx4_unregister_mac); int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; struct mlx4_mac_table *table = &info->mac_table; int index = qpn - info->base_qpn; int err = 0; /* CX1 doesn't support multi-functions */ mutex_lock(&table->mutex); err = validate_index(dev, table, index); if (err) goto out; table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); err = mlx4_set_port_mac_table(dev, port, table->entries); if (unlikely(err)) { mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) new_mac); table->entries[index] = 0; } out: mutex_unlock(&table->mutex); return err; } EXPORT_SYMBOL_GPL(__mlx4_replace_mac); static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, __be32 *entries) { struct mlx4_cmd_mailbox *mailbox; u32 in_mod; int err; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE); in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int i; for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) { if (table->refs[i] && (vid == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { /* VLAN already registered, increase reference count */ *idx = i; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan); int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, 
u16 vlan, int *index) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int i, err = 0; int free = -1; mutex_lock(&table->mutex); if (table->total == table->max) { /* No free vlan entries */ err = -ENOSPC; goto out; } for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { if (free < 0 && (table->refs[i] == 0)) { free = i; continue; } if (table->refs[i] && (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { /* Vlan already registered, increase references count */ *index = i; ++table->refs[i]; goto out; } } if (free < 0) { err = -ENOMEM; goto out; } /* Register new VLAN */ table->refs[free] = 1; table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); err = mlx4_set_port_vlan_table(dev, port, table->entries); if (unlikely(err)) { mlx4_warn(dev, "Failed adding vlan: %u\n", vlan); table->refs[free] = 0; table->entries[free] = 0; goto out; } *index = free; ++table->total; out: mutex_unlock(&table->mutex); return err; } int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) { u64 out_param = 0; int err; if (vlan > 4095) return -EINVAL; if (mlx4_is_mfunc(dev)) { err = mlx4_cmd_imm(dev, vlan, &out_param, ((u32) port) << 8 | (u32) RES_VLAN, RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) *index = get_param_l(&out_param); return err; } return __mlx4_register_vlan(dev, port, vlan, index); } EXPORT_SYMBOL_GPL(mlx4_register_vlan); void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int index; mutex_lock(&table->mutex); if (mlx4_find_cached_vlan(dev, port, vlan, &index)) { mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan); goto out; } if (index < MLX4_VLAN_REGULAR) { mlx4_warn(dev, "Trying to free special vlan index %d\n", index); goto out; } if (--table->refs[index]) { mlx4_dbg(dev, "Have %d more references for index %d, " "no need to modify vlan table\n", table->refs[index], index); goto out; } table->entries[index] = 0; mlx4_set_port_vlan_table(dev, port, table->entries); --table->total; out: mutex_unlock(&table->mutex); } void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) { u64 out_param = 0; if (mlx4_is_mfunc(dev)) { (void) mlx4_cmd_imm(dev, vlan, &out_param, ((u32) port) << 8 | (u32) RES_VLAN, RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); return; } __mlx4_unregister_vlan(dev, port, vlan); } EXPORT_SYMBOL_GPL(mlx4_unregister_vlan); int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) { struct mlx4_cmd_mailbox *inmailbox, *outmailbox; u8 *inbuf, *outbuf; int err; inmailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(inmailbox)) return PTR_ERR(inmailbox); outmailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(outmailbox)) { mlx4_free_cmd_mailbox(dev, inmailbox); return PTR_ERR(outmailbox); } inbuf = inmailbox->buf; outbuf = outmailbox->buf; memset(inbuf, 0, 256); memset(outbuf, 0, 256); inbuf[0] = 1; inbuf[1] = 1; inbuf[2] = 1; inbuf[3] = 1; *(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015); *(__be32 *) (&inbuf[20]) = cpu_to_be32(port); err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3, MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (!err) *caps = *(__be32 *) (outbuf + 84); mlx4_free_cmd_mailbox(dev, inmailbox); mlx4_free_cmd_mailbox(dev, outmailbox); return err; } static struct mlx4_roce_gid_entry zgid_entry; int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave) { if (slave == 0) return 
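/* the PF keeps a fixed block of RoCE GIDs; the remainder is divided among the VFs below */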
MLX4_ROCE_PF_GIDS; if (slave <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % dev->num_vfs)) return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs) + 1; return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs; } int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave) { int gids; int vfs; gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; vfs = dev->num_vfs; if (slave == 0) return 0; if (slave <= gids % vfs) return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1); return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1)); } static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, u8 op_mod, struct mlx4_cmd_mailbox *inbox) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_port_info *port_info; struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; struct mlx4_slave_state *slave_st = &master->slave_state[slave]; struct mlx4_set_port_rqp_calc_context *qpn_context; struct mlx4_set_port_general_context *gen_context; struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1; int reset_qkey_viols; int port; int is_eth; int num_gids; int base; u32 in_modifier; u32 promisc; u16 mtu, prev_mtu; int err; int i, j; int offset; __be32 agg_cap_mask; __be32 slave_cap_mask; __be32 new_cap_mask; port = in_mod & 0xff; in_modifier = (in_mod >> 8) & 0xff; is_eth = op_mod; port_info = &priv->port[port]; if (op_mod > 1) return -EINVAL; /* Slaves cannot perform SET_PORT operations except changing MTU */ if (is_eth) { if (slave != dev->caps.function && in_modifier != MLX4_SET_PORT_GENERAL && in_modifier != MLX4_SET_PORT_GID_TABLE) { mlx4_warn(dev, "denying SET_PORT for slave:%d," "port %d, config_select 0x%x\n", slave, port, in_modifier); return -EINVAL; } switch (in_modifier) { case MLX4_SET_PORT_RQP_CALC: qpn_context = inbox->buf; qpn_context->base_qpn = cpu_to_be32(port_info->base_qpn); qpn_context->n_mac = 0x7; promisc = be32_to_cpu(qpn_context->promisc) >> SET_PORT_PROMISC_SHIFT; qpn_context->promisc = cpu_to_be32( promisc << SET_PORT_PROMISC_SHIFT | port_info->base_qpn); promisc = be32_to_cpu(qpn_context->mcast) >> SET_PORT_MC_PROMISC_SHIFT; qpn_context->mcast = cpu_to_be32( promisc << SET_PORT_MC_PROMISC_SHIFT | port_info->base_qpn); break; case MLX4_SET_PORT_GENERAL: gen_context = inbox->buf; /* Mtu is configured as the max MTU among all the * the functions on the port. */ mtu = be16_to_cpu(gen_context->mtu); mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port] + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN); prev_mtu = slave_st->mtu[port]; slave_st->mtu[port] = mtu; if (mtu > master->max_mtu[port]) master->max_mtu[port] = mtu; if (mtu < prev_mtu && prev_mtu == master->max_mtu[port]) { slave_st->mtu[port] = mtu; master->max_mtu[port] = mtu; for (i = 0; i < dev->num_slaves; i++) { master->max_mtu[port] = max(master->max_mtu[port], master->slave_state[i].mtu[port]); } } gen_context->mtu = cpu_to_be16(master->max_mtu[port]); break; case MLX4_SET_PORT_GID_TABLE: /* change to MULTIPLE entries: number of guest's gids * need a FOR-loop here over number of gids the guest has. * 1. 
Check no duplicates in gids passed by slave */ num_gids = mlx4_get_slave_num_gids(dev, slave); base = mlx4_get_base_gid_ix(dev, slave); gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); for (i = 0; i < num_gids; gid_entry_mbox++, i++) { if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, sizeof(zgid_entry))) continue; gid_entry_mb1 = gid_entry_mbox + 1; for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) { if (!memcmp(gid_entry_mb1->raw, zgid_entry.raw, sizeof(zgid_entry))) continue; if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw, sizeof(gid_entry_mbox->raw))) { /* found duplicate */ return -EINVAL; } } } /* 2. Check that there are no duplicates in OTHER * entries in the port GID table */ for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { if (i >= base && i < base + num_gids) continue; /* don't compare to slave's current gids */ gid_entry_tbl = &priv->roce_gids[port - 1][i]; if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry))) continue; gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); for (j = 0; j < num_gids; gid_entry_mbox++, j++) { if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, sizeof(zgid_entry))) continue; if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw, sizeof(gid_entry_tbl->raw))) { /* found duplicate */ mlx4_warn(dev, "requested gid entry for slave:%d " "is a duplicate of gid at index %d\n", slave, i); return -EINVAL; } } } /* insert slave GIDs with memcpy, starting at slave's base index */ gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++) memcpy(priv->roce_gids[port - 1][offset].raw, gid_entry_mbox->raw, 16); /* Now, copy roce port gids table to current mailbox for passing to FW */ gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) memcpy(gid_entry_mbox->raw, priv->roce_gids[port - 1][i].raw, 16); break; } return mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } /* For IB, we only consider: * - The capability mask, which is set to the aggregate of all * slave function capabilities * - The QKey violation counter - reset according to each request. */ if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40; new_cap_mask = ((__be32 *) inbox->buf)[2]; } else { reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1; new_cap_mask = ((__be32 *) inbox->buf)[1]; } /* slave may not set the IS_SM capability for the port */ if (slave != mlx4_master_func_num(dev) && (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM)) return -EINVAL; /* No DEV_MGMT in multifunc mode */ if (mlx4_is_mfunc(dev) && (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP)) return -EINVAL; agg_cap_mask = 0; slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask; for (i = 0; i < dev->num_slaves; i++) agg_cap_mask |= priv->mfunc.master.slave_state[i].ib_cap_mask[port]; /* only clear mailbox for guests. 
	/* only clear mailbox for guests.  Master may be setting
	 * MTU or PKEY table size
	 */
	if (slave != dev->caps.function)
		memset(inbox->buf, 0, 256);

	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
		*(u8 *) inbox->buf |= !!reset_qkey_viols << 6;
		((__be32 *) inbox->buf)[2] = agg_cap_mask;
	} else {
		((u8 *) inbox->buf)[3] |= !!reset_qkey_viols;
		((__be32 *) inbox->buf)[1] = agg_cap_mask;
	}

	err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
	if (err)
		priv->mfunc.master.slave_state[slave].ib_cap_mask[port] =
			slave_cap_mask;
	return err;
}

int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave,
			  struct mlx4_vhcr *vhcr,
			  struct mlx4_cmd_mailbox *inbox,
			  struct mlx4_cmd_mailbox *outbox,
			  struct mlx4_cmd_info *cmd)
{
	return mlx4_common_set_port(dev, slave, vhcr->in_modifier,
				    vhcr->op_modifier, inbox);
}

/* bit locations for set port command with zero op modifier */
enum {
	MLX4_SET_PORT_VL_CAP		= 4,  /* bits 7:4 */
	MLX4_SET_PORT_MTU_CAP		= 12, /* bits 15:12 */
	MLX4_CHANGE_PORT_PKEY_TBL_SZ	= 20,
	MLX4_CHANGE_PORT_VL_CAP		= 21,
	MLX4_CHANGE_PORT_MTU_CAP	= 22,
};

int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz)
{
	struct mlx4_cmd_mailbox *mailbox;
	int err = -EINVAL, vl_cap, pkey_tbl_flag = 0;
	u32 in_mod;

	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_NONE)
		return 0;

	mailbox = mlx4_alloc_cmd_mailbox(dev);
	if (IS_ERR(mailbox))
		return PTR_ERR(mailbox);

	memset(mailbox->buf, 0, 256);

	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) {
		in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
		err = mlx4_cmd(dev, mailbox->dma, in_mod, 1,
			       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
			       MLX4_CMD_WRAPPED);
	} else {
		((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];

		if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) {
			pkey_tbl_flag = 1;
			((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz);
		}

		/* IB VL CAP enum isn't used by the firmware, just numerical values */
		for (vl_cap = dev->caps.vl_cap[port]; vl_cap >= 1;
		     vl_cap >>= 1) {
			((__be32 *) mailbox->buf)[0] = cpu_to_be32(
				(1 << MLX4_CHANGE_PORT_MTU_CAP) |
				(1 << MLX4_CHANGE_PORT_VL_CAP) |
				(pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) |
				(dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) |
				(vl_cap << MLX4_SET_PORT_VL_CAP));
			err = mlx4_cmd(dev, mailbox->dma, port, 0,
				       MLX4_CMD_SET_PORT,
				       MLX4_CMD_TIME_CLASS_B,
				       MLX4_CMD_WRAPPED);
			if (err != -ENOMEM)
				break;
		}
	}

	mlx4_free_cmd_mailbox(dev, mailbox);
	return err;
}

int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
{
	struct mlx4_cmd_mailbox *mailbox;
	struct mlx4_set_port_general_context *context;
	int err;
	u32 in_mod;

	mailbox = mlx4_alloc_cmd_mailbox(dev);
	if (IS_ERR(mailbox))
		return PTR_ERR(mailbox);
	context = mailbox->buf;
	memset(context, 0, sizeof *context);

	context->flags = SET_PORT_GEN_ALL_VALID;
	context->mtu = cpu_to_be16(mtu);
	context->pptx = (pptx * (!pfctx)) << 7;
	context->pfctx = pfctx;
	context->pprx = (pprx * (!pfcrx)) << 7;
	context->pfcrx = pfcrx;

	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);

	mlx4_free_cmd_mailbox(dev, mailbox);
	return err;
}
EXPORT_SYMBOL(mlx4_SET_PORT_general);
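/*
 * Usage sketch (illustrative only, not compiled in): an Ethernet caller
 * would typically pass an MTU that already includes the L2 overhead,
 * together with its pause and priority-flow-control settings, e.g.:
 *
 *	err = mlx4_SET_PORT_general(dev, port,
 *				    mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN,
 *				    pptx, pfctx, pprx, pfcrx);
 *
 * Note that global pause (pptx/pprx) is masked off above whenever the
 * corresponding per-priority flow control byte (pfctx/pfcrx) is nonzero,
 * as the two modes are mutually exclusive.
 */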
int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
			   u8 promisc)
{
	struct mlx4_cmd_mailbox *mailbox;
	struct mlx4_set_port_rqp_calc_context *context;
	int err;
	u32 in_mod;
	u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ?
		MCAST_DIRECT : MCAST_DEFAULT;

	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
		return 0;

	mailbox = mlx4_alloc_cmd_mailbox(dev);
	if (IS_ERR(mailbox))
		return PTR_ERR(mailbox);
	context = mailbox->buf;
	memset(context, 0, sizeof *context);

	context->base_qpn = cpu_to_be32(base_qpn);
	context->n_mac = dev->caps.log_num_macs;
	context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT |
				       base_qpn);
	context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT |
				     base_qpn);
	context->intra_no_vlan = 0;
	context->no_vlan = MLX4_NO_VLAN_IDX;
	context->intra_vlan_miss = 0;
	context->vlan_miss = MLX4_VLAN_MISS_IDX;

	in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);

	mlx4_free_cmd_mailbox(dev, mailbox);
	return err;
}
EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc);

int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc)
{
	struct mlx4_cmd_mailbox *mailbox;
	struct mlx4_set_port_prio2tc_context *context;
	int err;
	u32 in_mod;
	int i;

	mailbox = mlx4_alloc_cmd_mailbox(dev);
	if (IS_ERR(mailbox))
		return PTR_ERR(mailbox);
	context = mailbox->buf;
	memset(context, 0, sizeof *context);

	for (i = 0; i < MLX4_NUM_UP; i += 2)
		context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1];

	in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port;
	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);

	mlx4_free_cmd_mailbox(dev, mailbox);
	return err;
}
EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC);

int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
			    u8 *pg, u16 *ratelimit)
{
	struct mlx4_cmd_mailbox *mailbox;
	struct mlx4_set_port_scheduler_context *context;
	int err;
	u32 in_mod;
	int i;

	mailbox = mlx4_alloc_cmd_mailbox(dev);
	if (IS_ERR(mailbox))
		return PTR_ERR(mailbox);
	context = mailbox->buf;
	memset(context, 0, sizeof *context);

	for (i = 0; i < MLX4_NUM_TC; i++) {
		struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i];
		u16 r;

		if (ratelimit && ratelimit[i]) {
			if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) {
				r = ratelimit[i];
				tc->max_bw_units =
					htons(MLX4_RATELIMIT_100M_UNITS);
			} else {
				r = ratelimit[i] / 10;
				tc->max_bw_units =
					htons(MLX4_RATELIMIT_1G_UNITS);
			}
			tc->max_bw_value = htons(r);
		} else {
			tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT);
			tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS);
		}

		tc->pg = htons(pg[i]);
		tc->bw_precentage = htons(tc_tx_bw[i]);
	}

	in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port;
	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);

	mlx4_free_cmd_mailbox(dev, mailbox);
	return err;
}
EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);

int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave,
				struct mlx4_vhcr *vhcr,
				struct mlx4_cmd_mailbox *inbox,
				struct mlx4_cmd_mailbox *outbox,
				struct mlx4_cmd_info *cmd)
{
	int err = 0;

	return err;
}

int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac,
			u64 clear, u8 mode)
{
	return mlx4_cmd(dev, (mac | (clear << 63)), port, mode,
			MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B,
			MLX4_CMD_WRAPPED);
}
EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR);

int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave,
			       struct mlx4_vhcr *vhcr,
			       struct mlx4_cmd_mailbox *inbox,
			       struct mlx4_cmd_mailbox *outbox,
			       struct mlx4_cmd_info *cmd)
{
	int err = 0;

	return err;
}

int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave,
				struct mlx4_vhcr *vhcr,
				struct mlx4_cmd_mailbox *inbox,
				struct mlx4_cmd_mailbox *outbox,
				struct mlx4_cmd_info *cmd)
{
	return 0;
}
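/*
 * Layout note (added for clarity): mlx4_set_stats_bitmap() below builds
 * the bitmap from consecutive, fixed-order ranges - packet stats,
 * optional flow stats, VF stats, vport stats, then port stats - setting
 * each range only for the device role (slave or master) that reports it.
 */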
void
mlx4_set_stats_bitmap(struct mlx4_dev *dev, unsigned long *stats_bitmap)
{
	int last_i = 0;

	bitmap_zero(stats_bitmap, NUM_ALL_STATS);

	if (mlx4_is_slave(dev)) {
		last_i = dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN ?
			NUM_PKT_STATS + NUM_FLOW_STATS : NUM_PKT_STATS;
	} else {
		bitmap_set(stats_bitmap, last_i, NUM_PKT_STATS);
		last_i = NUM_PKT_STATS;

		if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) {
			bitmap_set(stats_bitmap, last_i, NUM_FLOW_STATS);
			last_i += NUM_FLOW_STATS;
		}
	}

	if (mlx4_is_slave(dev))
		bitmap_set(stats_bitmap, last_i, NUM_VF_STATS);
	last_i += NUM_VF_STATS;

	if (mlx4_is_master(dev))
		bitmap_set(stats_bitmap, last_i, NUM_VPORT_STATS);
	last_i += NUM_VPORT_STATS;

	bitmap_set(stats_bitmap, last_i, NUM_PORT_STATS);
}
EXPORT_SYMBOL(mlx4_set_stats_bitmap);

int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid,
				 int *slave_id)
{
	struct mlx4_priv *priv = mlx4_priv(dev);
	int i, found_ix = -1;
	int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;

	if (!mlx4_is_mfunc(dev))
		return -EINVAL;

	for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) {
		if (!memcmp(priv->roce_gids[port - 1][i].raw, gid, 16)) {
			found_ix = i;
			break;
		}
	}

	if (found_ix >= 0) {
		if (found_ix < MLX4_ROCE_PF_GIDS)
			*slave_id = 0;
		else if (found_ix < MLX4_ROCE_PF_GIDS +
			 (vf_gids % dev->num_vfs) *
			 (vf_gids / dev->num_vfs + 1))
			*slave_id = ((found_ix - MLX4_ROCE_PF_GIDS) /
				     (vf_gids / dev->num_vfs + 1)) + 1;
		else
			*slave_id = ((found_ix - MLX4_ROCE_PF_GIDS -
				((vf_gids % dev->num_vfs) *
				 ((vf_gids / dev->num_vfs + 1)))) /
				(vf_gids / dev->num_vfs)) +
				vf_gids % dev->num_vfs + 1;
	}

	return (found_ix >= 0) ? 0 : -EINVAL;
}
EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid);

int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id,
				 u8 *gid)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	if (!mlx4_is_master(dev))
		return -EINVAL;

	memcpy(gid, priv->roce_gids[port - 1][slave_id].raw, 16);
	return 0;
}
EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave);

/* Cable Module Info */
#define MODULE_INFO_MAX_READ 48

#define I2C_ADDR_LOW  0x50
#define I2C_ADDR_HIGH 0x51
#define I2C_PAGE_SIZE 256

/* Module Info Data */
struct mlx4_cable_info {
	u8	i2c_addr;
	u8	page_num;
	__be16	dev_mem_address;
	__be16	reserved1;
	__be16	size;
	__be32	reserved2[2];
	u8	data[MODULE_INFO_MAX_READ];
};

enum cable_info_err {
	CABLE_INF_INV_PORT	= 0x1,
	CABLE_INF_OP_NOSUP	= 0x2,
	CABLE_INF_NOT_CONN	= 0x3,
	CABLE_INF_NO_EEPRM	= 0x4,
	CABLE_INF_PAGE_ERR	= 0x5,
	CABLE_INF_INV_ADDR	= 0x6,
	CABLE_INF_I2C_ADDR	= 0x7,
	CABLE_INF_QSFP_VIO	= 0x8,
	CABLE_INF_I2C_BUSY	= 0x9,
};

#define MAD_STATUS_2_CABLE_ERR(mad_status) ((mad_status >> 8) & 0xFF)

#ifdef DEBUG
static inline const char *cable_info_mad_err_str(u16 mad_status)
{
	u8 err = MAD_STATUS_2_CABLE_ERR(mad_status);

	switch (err) {
	case CABLE_INF_INV_PORT:
		return "invalid port selected";
	case CABLE_INF_OP_NOSUP:
		return "operation not supported for this port (the port is of type CX4 or internal)";
	case CABLE_INF_NOT_CONN:
		return "cable is not connected";
	case CABLE_INF_NO_EEPRM:
		return "the connected cable has no EPROM (passive copper cable)";
	case CABLE_INF_PAGE_ERR:
		return "page number is greater than 15";
	case CABLE_INF_INV_ADDR:
		return "invalid device_address or size (that is, size equals 0 or address+size is greater than 256)";
	case CABLE_INF_I2C_ADDR:
		return "invalid I2C slave address";
	case CABLE_INF_QSFP_VIO:
		return "at least one cable violates the QSFP specification and ignores the modsel signal";
	case CABLE_INF_I2C_BUSY:
		return "I2C bus is constantly busy";
	}
	return "Unknown Error";
}
#endif /* DEBUG */
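/*
 * Addressing note (added for clarity): the module EEPROM is exposed as
 * two 256-byte pages behind I2C addresses 0x50 (low) and 0x51 (high),
 * and mlx4_get_module_info() below confines every read to one page: a
 * request that would cross offset 256 is truncated, and offsets of 256
 * or more are rebased onto the high-page address.
 */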
/**
 * mlx4_get_module_info - Read cable module eeprom data
 * @dev: mlx4_dev.
 * @port: port number.
 * @offset: byte offset in eeprom to start reading data from.
 * @size: number of bytes to read.
 * @data: output buffer to put the requested data into.
 *
 * Reads cable module eeprom data and places it in the data
 * pointer parameter.
 * Returns the number of bytes read on success or a negative error
 * code.
 */
int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, u16 offset,
			 u16 size, u8 *data)
{
	struct mlx4_cmd_mailbox *inbox, *outbox;
	struct mlx4_mad_ifc *inmad, *outmad;
	struct mlx4_cable_info *cable_info;
	u16 i2c_addr;
	int ret;

	if (size > MODULE_INFO_MAX_READ)
		size = MODULE_INFO_MAX_READ;

	inbox = mlx4_alloc_cmd_mailbox(dev);
	if (IS_ERR(inbox)) {
		mlx4_err(dev,
			 "mlx4_alloc_cmd_mailbox returned with error(%lx)",
			 PTR_ERR(inbox));
		return PTR_ERR(inbox);
	}

	outbox = mlx4_alloc_cmd_mailbox(dev);
	if (IS_ERR(outbox)) {
		mlx4_free_cmd_mailbox(dev, inbox);
		mlx4_err(dev,
			 "mlx4_alloc_cmd_mailbox returned with error(%lx)",
			 PTR_ERR(outbox));
		return PTR_ERR(outbox);
	}

	inmad = (struct mlx4_mad_ifc *)(inbox->buf);
	outmad = (struct mlx4_mad_ifc *)(outbox->buf);

	inmad->method = 0x1; /* Get */
	inmad->class_version = 0x1;
	inmad->mgmt_class = 0x1;
	inmad->base_version = 0x1;
	inmad->attr_id = cpu_to_be16(0xFF60); /* Module Info */

	if (offset < I2C_PAGE_SIZE && offset + size > I2C_PAGE_SIZE)
		/* Cross-page reads are not allowed;
		 * read until offset 256 in the low page.
		 */
		size -= offset + size - I2C_PAGE_SIZE;

	i2c_addr = I2C_ADDR_LOW;
	if (offset >= I2C_PAGE_SIZE) {
		/* Reset offset to high page */
		i2c_addr = I2C_ADDR_HIGH;
		offset -= I2C_PAGE_SIZE;
	}

	cable_info = (struct mlx4_cable_info *)inmad->data;
	cable_info->dev_mem_address = cpu_to_be16(offset);
	cable_info->page_num = 0;
	cable_info->i2c_addr = i2c_addr;
	cable_info->size = cpu_to_be16(size);

	ret = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
			   MLX4_CMD_NATIVE);
	if (ret)
		goto out;

	if (be16_to_cpu(outmad->status)) {
		/* MAD returned with bad status */
		ret = be16_to_cpu(outmad->status);
#ifdef DEBUG
		mlx4_warn(dev,
			  "MLX4_CMD_MAD_IFC Get Module info attr(%x) "
			  "port(%d) i2c_addr(%x) offset(%d) size(%d): Response "
			  "Mad Status(%x) - %s\n", 0xFF60, port, i2c_addr,
			  offset, size, ret, cable_info_mad_err_str(ret));
#endif
		if (i2c_addr == I2C_ADDR_HIGH &&
		    MAD_STATUS_2_CABLE_ERR(ret) == CABLE_INF_I2C_ADDR)
			/* Some SFP cables do not support the I2C slave
			 * address 0x51 (high page); abort silently.
			 */
			ret = 0;
		else
			ret = -ret;
		goto out;
	}
	cable_info = (struct mlx4_cable_info *)outmad->data;
	memcpy(data, cable_info->data, size);
	ret = size;
out:
	mlx4_free_cmd_mailbox(dev, inbox);
	mlx4_free_cmd_mailbox(dev, outbox);
	return ret;
}
EXPORT_SYMBOL(mlx4_get_module_info);
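/*
 * Usage sketch (illustrative only, not compiled in): since a single call
 * returns at most MODULE_INFO_MAX_READ (48) bytes, callers needing a
 * larger dump are expected to loop on the returned count:
 *
 *	while (total < len) {
 *		ret = mlx4_get_module_info(dev, port, offset + total,
 *					   len - total, buf + total);
 *		if (ret <= 0)
 *			break;
 *		total += ret;
 *	}
 */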