Index: head/sys/ofed/drivers/infiniband/hw/mlx4/cm.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/cm.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/cm.c (revision 296382) @@ -1,484 +1,484 @@ /* * Copyright (c) 2012 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include "mlx4_ib.h" #define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ) struct id_map_entry { struct rb_node node; u32 sl_cm_id; u32 pv_cm_id; int slave_id; int scheduled_delete; struct mlx4_ib_dev *dev; struct list_head list; struct delayed_work timeout; }; struct cm_generic_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; }; struct cm_sidr_generic_msg { struct ib_mad_hdr hdr; __be32 request_id; }; struct cm_req_msg { unsigned char unused[0x60]; union ib_gid primary_path_sgid; }; static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) { if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { struct cm_sidr_generic_msg *msg = (struct cm_sidr_generic_msg *)mad; msg->request_id = cpu_to_be32(cm_id); } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { pr_err("trying to set local_comm_id in SIDR_REP\n"); return; } else { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->local_comm_id = cpu_to_be32(cm_id); + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); } } static u32 get_local_comm_id(struct ib_mad *mad) { if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { struct cm_sidr_generic_msg *msg = (struct cm_sidr_generic_msg *)mad; return be32_to_cpu(msg->request_id); } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { pr_err("trying to set local_comm_id in SIDR_REP\n"); return -1; } else { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - return be32_to_cpu(msg->local_comm_id); + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); } } static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) { if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { struct cm_sidr_generic_msg *msg = (struct cm_sidr_generic_msg *)mad; msg->request_id = cpu_to_be32(cm_id); } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { pr_err("trying to set remote_comm_id in SIDR_REQ\n"); return; } else { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->remote_comm_id = cpu_to_be32(cm_id); + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); } } static u32 get_remote_comm_id(struct ib_mad *mad) { if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { struct cm_sidr_generic_msg *msg = (struct cm_sidr_generic_msg *)mad; return be32_to_cpu(msg->request_id); } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { pr_err("trying to set remote_comm_id in SIDR_REQ\n"); return -1; } else { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - return be32_to_cpu(msg->remote_comm_id); + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); } } static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) { struct cm_req_msg *msg = (struct cm_req_msg *)mad; return msg->primary_path_sgid; } /* Lock should be taken before called */ static struct id_map_entry * id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id) { struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; struct rb_node *node = sl_id_map->rb_node; while (node) { struct id_map_entry *id_map_entry = rb_entry(node, struct id_map_entry, node); if (id_map_entry->sl_cm_id > sl_cm_id) node = node->rb_left; else if (id_map_entry->sl_cm_id < sl_cm_id) node = node->rb_right; else if (id_map_entry->slave_id > slave_id) node = node->rb_left; else if (id_map_entry->slave_id < slave_id) node = node->rb_right; else return id_map_entry; } return NULL; } static void id_map_ent_timeout(struct work_struct *work) { struct delayed_work *delay = to_delayed_work(work); struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout); struct id_map_entry *db_ent, *found_ent; struct mlx4_ib_dev *dev = ent->dev; struct mlx4_ib_sriov *sriov = &dev->sriov; struct rb_root *sl_id_map = &sriov->sl_id_map; int pv_id = (int) ent->pv_cm_id; spin_lock(&sriov->id_map_lock); db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); if (!db_ent) goto out; found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); if (found_ent && found_ent == ent) rb_erase(&found_ent->node, sl_id_map); idr_remove(&sriov->pv_id_table, pv_id); out: list_del(&ent->list); spin_unlock(&sriov->id_map_lock); kfree(ent); } static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) { struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; struct rb_root *sl_id_map = &sriov->sl_id_map; struct id_map_entry *ent, *found_ent; spin_lock(&sriov->id_map_lock); ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); if (!ent) goto out; found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); if (found_ent && found_ent == ent) rb_erase(&found_ent->node, sl_id_map); idr_remove(&sriov->pv_id_table, pv_cm_id); out: spin_unlock(&sriov->id_map_lock); } static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) { struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; struct rb_node **link = &sl_id_map->rb_node, *parent = NULL; struct id_map_entry *ent; int slave_id = new->slave_id; int sl_cm_id = new->sl_cm_id; ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); if (ent) { pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n", sl_cm_id); rb_replace_node(&ent->node, &new->node, sl_id_map); return; } /* Go to the bottom of the tree */ while (*link) { parent = *link; ent = rb_entry(parent, struct id_map_entry, node); if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) link = &(*link)->rb_left; else link = &(*link)->rb_right; } rb_link_node(&new->node, parent, link); rb_insert_color(&new->node, sl_id_map); } static struct id_map_entry * id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) { int ret, id; static int next_id; struct id_map_entry *ent; struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL); if (!ent) { mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n"); return ERR_PTR(-ENOMEM); } ent->sl_cm_id = sl_cm_id; ent->slave_id = slave_id; ent->scheduled_delete = 0; ent->dev = to_mdev(ibdev); INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); do { spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); ret = idr_get_new_above(&sriov->pv_id_table, ent, next_id, &id); if (!ret) { next_id = ((unsigned) id + 1) & MAX_IDR_MASK; ent->pv_cm_id = (u32)id; sl_id_map_add(ibdev, ent); } spin_unlock(&sriov->id_map_lock); } while (ret == -EAGAIN && idr_pre_get(&sriov->pv_id_table, GFP_KERNEL)); /*the function idr_get_new_above can return -ENOSPC, so don't insert in that case.*/ if (!ret) { spin_lock(&sriov->id_map_lock); list_add_tail(&ent->list, &sriov->cm_list); spin_unlock(&sriov->id_map_lock); return ent; } /*error flow*/ kfree(ent); mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); return ERR_PTR(-ENOMEM); } static struct id_map_entry * id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id) { struct id_map_entry *ent; struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; spin_lock(&sriov->id_map_lock); if (*pv_cm_id == -1) { ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id); if (ent) *pv_cm_id = (int) ent->pv_cm_id; } else ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id); spin_unlock(&sriov->id_map_lock); return ent; } static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) { struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; unsigned long flags; spin_lock(&sriov->id_map_lock); spin_lock_irqsave(&sriov->going_down_lock, flags); /*make sure that there is no schedule inside the scheduled work.*/ if (!sriov->is_going_down) { id->scheduled_delete = 1; schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); } spin_unlock_irqrestore(&sriov->going_down_lock, flags); spin_unlock(&sriov->id_map_lock); } int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, struct ib_mad *mad) { struct id_map_entry *id; u32 sl_cm_id; int pv_cm_id = -1; if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || mad->mad_hdr.attr_id == CM_REP_ATTR_ID || mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID || mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { sl_cm_id = get_local_comm_id(mad); id = id_map_alloc(ibdev, slave_id, sl_cm_id); if (IS_ERR(id)) { mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", __func__, slave_id, sl_cm_id); return PTR_ERR(id); } } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { return 0; } else { sl_cm_id = get_local_comm_id(mad); id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); } if (!id) { pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", slave_id, sl_cm_id); return -EINVAL; } set_local_comm_id(mad, id->pv_cm_id); if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) schedule_delayed(ibdev, id); else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) id_map_find_del(ibdev, pv_cm_id); return 0; } int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, struct ib_mad *mad, int is_eth) { u32 pv_cm_id; struct id_map_entry *id; if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { union ib_gid gid; if (is_eth) return 0; gid = gid_from_req_msg(ibdev, mad); *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); if (*slave < 0) { mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", (unsigned long long)gid.global.interface_id); return -ENOENT; } return 0; } pv_cm_id = get_remote_comm_id(mad); id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); if (!id) { pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); return -ENOENT; } if (!is_eth) *slave = id->slave_id; set_remote_comm_id(mad, id->sl_cm_id); if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) schedule_delayed(ibdev, id); else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { id_map_find_del(ibdev, (int) pv_cm_id); } return 0; } void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) { spin_lock_init(&dev->sriov.id_map_lock); INIT_LIST_HEAD(&dev->sriov.cm_list); dev->sriov.sl_id_map = RB_ROOT; idr_init(&dev->sriov.pv_id_table); idr_pre_get(&dev->sriov.pv_id_table, GFP_KERNEL); } /* slave = -1 ==> all slaves */ /* TBD -- call paravirt clean for single slave. Need for slave RESET event */ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) { struct mlx4_ib_sriov *sriov = &dev->sriov; struct rb_root *sl_id_map = &sriov->sl_id_map; struct list_head lh; struct rb_node *nd; int need_flush = 1; struct id_map_entry *map, *tmp_map; /* cancel all delayed work queue entries */ INIT_LIST_HEAD(&lh); spin_lock(&sriov->id_map_lock); list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { if (slave < 0 || slave == map->slave_id) { if (map->scheduled_delete) need_flush &= !!cancel_delayed_work(&map->timeout); } } spin_unlock(&sriov->id_map_lock); if (!need_flush) flush_scheduled_work(); /* make sure all timers were flushed */ /* now, remove all leftover entries from databases*/ spin_lock(&sriov->id_map_lock); if (slave < 0) { while (rb_first(sl_id_map)) { struct id_map_entry *ent = rb_entry(rb_first(sl_id_map), struct id_map_entry, node); rb_erase(&ent->node, sl_id_map); idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); } list_splice_init(&dev->sriov.cm_list, &lh); } else { /* first, move nodes belonging to slave to db remove list */ nd = rb_first(sl_id_map); while (nd) { struct id_map_entry *ent = rb_entry(nd, struct id_map_entry, node); nd = rb_next(nd); if (ent->slave_id == slave) list_move_tail(&ent->list, &lh); } /* remove those nodes from databases */ list_for_each_entry_safe(map, tmp_map, &lh, list) { rb_erase(&map->node, sl_id_map); idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); } /* add remaining nodes from cm_list */ list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { if (slave == map->slave_id) list_move_tail(&map->list, &lh); } } spin_unlock(&sriov->id_map_lock); /* free any map entries left behind due to cancel_delayed_work above */ list_for_each_entry_safe(map, tmp_map, &lh, list) { list_del(&map->list); kfree(map); } } Index: head/sys/ofed/drivers/infiniband/hw/mlx4/cq.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/cq.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/cq.c (revision 296382) @@ -1,1009 +1,1009 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include "mlx4_ib.h" #include "user.h" /* Which firmware version adds support for Resize CQ */ #define MLX4_FW_VER_RESIZE_CQ mlx4_fw_ver(2, 5, 0) #define MLX4_FW_VER_IGNORE_OVERRUN_CQ mlx4_fw_ver(2, 7, 8200) static void mlx4_ib_cq_comp(struct mlx4_cq *cq) { struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; ibcq->comp_handler(ibcq, ibcq->cq_context); } static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) { struct ib_event event; struct ib_cq *ibcq; if (type != MLX4_EVENT_TYPE_CQ_ERROR) { pr_warn("Unexpected event type %d " "on CQ %06x\n", type, cq->cqn); return; } ibcq = &to_mibcq(cq)->ibcq; if (ibcq->event_handler) { event.device = ibcq->device; event.event = IB_EVENT_CQ_ERR; event.element.cq = ibcq; ibcq->event_handler(&event, ibcq->cq_context); } } static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) { return mlx4_buf_offset(&buf->buf, n * buf->entry_size); } static void *get_cqe(struct mlx4_ib_cq *cq, int n) { return get_cqe_from_buf(&cq->buf, n); } static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) { struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; } static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq) { return get_sw_cqe(cq, cq->mcq.cons_index); } int mlx4_ib_modify_cq(struct ib_cq *cq, struct ib_cq_attr *cq_attr, int cq_attr_mask) { int err = 0; struct mlx4_ib_cq *mcq = to_mcq(cq); struct mlx4_ib_dev *dev = to_mdev(cq->device); if (cq_attr_mask & IB_CQ_CAP_FLAGS) { if (cq_attr->cq_cap_flags & IB_CQ_TIMESTAMP) return -ENOTSUPP; if (cq_attr->cq_cap_flags & IB_CQ_IGNORE_OVERRUN) { if (dev->dev->caps.cq_flags & MLX4_DEV_CAP_CQ_FLAG_IO) err = mlx4_cq_ignore_overrun(dev->dev, &mcq->mcq); else err = -ENOSYS; } } if (!err) if (cq_attr_mask & IB_CQ_MODERATION) err = mlx4_cq_modify(dev->dev, &mcq->mcq, cq_attr->moderation.cq_count, cq_attr->moderation.cq_period); return err; } static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent) { int err; err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, PAGE_SIZE * 2, &buf->buf); if (err) goto out; buf->entry_size = dev->dev->caps.cqe_size; err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, &buf->mtt); if (err) goto err_buf; err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); if (err) goto err_mtt; return 0; err_mtt: mlx4_mtt_cleanup(dev->dev, &buf->mtt); err_buf: mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); out: return err; } static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) { mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); } static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, struct mlx4_ib_cq_buf *buf, struct ib_umem **umem, u64 buf_addr, int cqe) { int err; int cqe_size = dev->dev->caps.cqe_size; int shift; int n; *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); n = ib_umem_page_count(*umem); shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n); err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); if (err) goto err_buf; err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem); if (err) goto err_mtt; return 0; err_mtt: mlx4_mtt_cleanup(dev->dev, &buf->mtt); err_buf: ib_umem_release(*umem); return err; } /* we don't support system timestamping */ #define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_TIMESTAMP struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_cq *cq; struct mlx4_uar *uar; int err; int entries = attr->cqe; int vector = attr->comp_vector; if (entries < 1 || entries > dev->dev->caps.max_cqes) return ERR_PTR(-EINVAL); if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED) return ERR_PTR(-EINVAL); cq = kzalloc(sizeof(*cq), GFP_KERNEL); if (!cq) return ERR_PTR(-ENOMEM); entries = roundup_pow_of_two(entries + 1); cq->ibcq.cqe = entries - 1; mutex_init(&cq->resize_mutex); spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; cq->create_flags = attr->flags; if (context) { struct mlx4_ib_create_cq ucmd; if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { err = -EFAULT; goto err_cq; } err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem, ucmd.buf_addr, entries); if (err) goto err_cq; err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr, &cq->db); if (err) goto err_mtt; uar = &to_mucontext(context)->uar; } else { err = mlx4_db_alloc(dev->dev, &cq->db, 1); if (err) goto err_cq; cq->mcq.set_ci_db = cq->db.db; cq->mcq.arm_db = cq->db.db + 1; *cq->mcq.set_ci_db = 0; *cq->mcq.arm_db = 0; err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries); if (err) goto err_db; uar = &dev->priv_uar; } if (dev->eq_table) vector = dev->eq_table[vector % ibdev->num_comp_vectors]; err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, cq->db.dma, &cq->mcq, vector, 0, !!(cq->create_flags & IB_CQ_TIMESTAMP)); if (err) goto err_dbmap; cq->mcq.comp = mlx4_ib_cq_comp; cq->mcq.event = mlx4_ib_cq_event; if (context) if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) { err = -EFAULT; goto err_dbmap; } return &cq->ibcq; err_dbmap: if (context) mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); err_mtt: mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); if (context) ib_umem_release(cq->umem); else mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); err_db: if (!context) mlx4_db_free(dev->dev, &cq->db); err_cq: kfree(cq); return ERR_PTR(err); } static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, int entries) { int err; if (cq->resize_buf) return -EBUSY; cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); if (!cq->resize_buf) return -ENOMEM; err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); if (err) { kfree(cq->resize_buf); cq->resize_buf = NULL; return err; } cq->resize_buf->cqe = entries - 1; return 0; } static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, int entries, struct ib_udata *udata) { struct mlx4_ib_resize_cq ucmd; int err; if (cq->resize_umem) return -EBUSY; if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) return -EFAULT; cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); if (!cq->resize_buf) return -ENOMEM; err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf, &cq->resize_umem, ucmd.buf_addr, entries); if (err) { kfree(cq->resize_buf); cq->resize_buf = NULL; return err; } cq->resize_buf->cqe = entries - 1; return 0; } static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) { u32 i; i = cq->mcq.cons_index; while (get_sw_cqe(cq, i)) ++i; return i - cq->mcq.cons_index; } static int mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) { struct mlx4_cqe *cqe, *new_cqe; int i; int cqe_size = cq->buf.entry_size; int cqe_inc = cqe_size == 64 ? 1 : 0; struct mlx4_cqe *start_cqe; i = cq->mcq.cons_index; cqe = get_cqe(cq, i & cq->ibcq.cqe); start_cqe = cqe; cqe += cqe_inc; while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, (i + 1) & cq->resize_buf->cqe); memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); new_cqe += cqe_inc; new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); cqe = get_cqe(cq, ++i & cq->ibcq.cqe); if (cqe == start_cqe) { pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", cq->mcq.cqn); return -ENOMEM; } cqe += cqe_inc; } ++cq->mcq.cons_index; return 0; } int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibcq->device); struct mlx4_ib_cq *cq = to_mcq(ibcq); struct mlx4_mtt mtt; int outst_cqe; int err; if (dev->dev->caps.fw_ver < MLX4_FW_VER_RESIZE_CQ) return -ENOSYS; mutex_lock(&cq->resize_mutex); if (entries < 1 || entries > dev->dev->caps.max_cqes) { err = -EINVAL; goto out; } entries = roundup_pow_of_two(entries + 1); if (entries == ibcq->cqe + 1) { err = 0; goto out; } if (entries > dev->dev->caps.max_cqes + 1) { err = -EINVAL; goto out; } if (ibcq->uobject) { err = mlx4_alloc_resize_umem(dev, cq, entries, udata); if (err) goto out; } else { /* Can't be smaller than the number of outstanding CQEs */ outst_cqe = mlx4_ib_get_outstanding_cqes(cq); if (entries < outst_cqe + 1) { err = 0; goto out; } err = mlx4_alloc_resize_buf(dev, cq, entries); if (err) goto out; } mtt = cq->buf.mtt; err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt); if (err) goto err_buf; mlx4_mtt_cleanup(dev->dev, &mtt); if (ibcq->uobject) { cq->buf = cq->resize_buf->buf; cq->ibcq.cqe = cq->resize_buf->cqe; ib_umem_release(cq->umem); cq->umem = cq->resize_umem; kfree(cq->resize_buf); cq->resize_buf = NULL; cq->resize_umem = NULL; } else { struct mlx4_ib_cq_buf tmp_buf; int tmp_cqe = 0; spin_lock_irq(&cq->lock); if (cq->resize_buf) { err = mlx4_ib_cq_resize_copy_cqes(cq); tmp_buf = cq->buf; tmp_cqe = cq->ibcq.cqe; cq->buf = cq->resize_buf->buf; cq->ibcq.cqe = cq->resize_buf->cqe; kfree(cq->resize_buf); cq->resize_buf = NULL; } spin_unlock_irq(&cq->lock); if (tmp_cqe) mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe); } goto out; err_buf: mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt); if (!ibcq->uobject) mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf, cq->resize_buf->cqe); kfree(cq->resize_buf); cq->resize_buf = NULL; if (cq->resize_umem) { ib_umem_release(cq->resize_umem); cq->resize_umem = NULL; } out: mutex_unlock(&cq->resize_mutex); return err; } int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq) { struct mlx4_ib_dev *dev = to_mdev(ibcq->device); struct mlx4_ib_cq *cq = to_mcq(ibcq); if (dev->dev->caps.fw_ver < MLX4_FW_VER_IGNORE_OVERRUN_CQ) return -ENOSYS; return mlx4_cq_ignore_overrun(dev->dev, &cq->mcq); } int mlx4_ib_destroy_cq(struct ib_cq *cq) { struct mlx4_ib_dev *dev = to_mdev(cq->device); struct mlx4_ib_cq *mcq = to_mcq(cq); mlx4_cq_free(dev->dev, &mcq->mcq); mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt); if (cq->uobject) { mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); ib_umem_release(mcq->umem); } else { mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); mlx4_db_free(dev->dev, &mcq->db); } kfree(mcq); return 0; } static void dump_cqe(void *cqe) { __be32 *buf = cqe; pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]), be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]), be32_to_cpu(buf[6]), be32_to_cpu(buf[7])); } static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ib_wc *wc) { if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) { pr_debug("local QP operation err " "(QPN %06x, WQE index %x, vendor syndrome %02x, " "opcode = %02x)\n", be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index), cqe->vendor_err_syndrome, cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); dump_cqe(cqe); } switch (cqe->syndrome) { case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: wc->status = IB_WC_LOC_LEN_ERR; break; case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: wc->status = IB_WC_LOC_QP_OP_ERR; break; case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: wc->status = IB_WC_LOC_PROT_ERR; break; case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: wc->status = IB_WC_WR_FLUSH_ERR; break; case MLX4_CQE_SYNDROME_MW_BIND_ERR: wc->status = IB_WC_MW_BIND_ERR; break; case MLX4_CQE_SYNDROME_BAD_RESP_ERR: wc->status = IB_WC_BAD_RESP_ERR; break; case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: wc->status = IB_WC_LOC_ACCESS_ERR; break; case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: wc->status = IB_WC_REM_INV_REQ_ERR; break; case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: wc->status = IB_WC_REM_ACCESS_ERR; break; case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: wc->status = IB_WC_REM_OP_ERR; break; case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: wc->status = IB_WC_RETRY_EXC_ERR; break; case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: wc->status = IB_WC_RNR_RETRY_EXC_ERR; break; case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: wc->status = IB_WC_REM_ABORT_ERR; break; default: wc->status = IB_WC_GENERAL_ERR; break; } wc->vendor_err = cqe->vendor_err_syndrome; } static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) { return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV4F | MLX4_CQE_STATUS_IPV4OPT | MLX4_CQE_STATUS_IPV6 | MLX4_CQE_STATUS_IPOK)) == cpu_to_be16(MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPOK)) && (status & cpu_to_be16(MLX4_CQE_STATUS_UDP | MLX4_CQE_STATUS_TCP)) && checksum == cpu_to_be16(0xffff); } static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, unsigned tail, struct mlx4_cqe *cqe, int is_eth) { struct mlx4_ib_proxy_sqp_hdr *hdr; ib_dma_sync_single_for_cpu(qp->ibqp.device, qp->sqp_proxy_rcv[tail].map, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; wc->dlid_path_bits = 0; if (is_eth) { wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); } else { wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); } return 0; } static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_ib_qp **cur_qp, struct ib_wc *wc) { struct mlx4_cqe *cqe; struct mlx4_qp *mqp; struct mlx4_ib_wq *wq; struct mlx4_ib_srq *srq; struct mlx4_srq *msrq = NULL; int is_send; int is_error; u32 g_mlpath_rqpn; u16 wqe_ctr; unsigned tail = 0; int timestamp_en = !!(cq->create_flags & IB_CQ_TIMESTAMP); repoll: cqe = next_cqe_sw(cq); if (!cqe) return -EAGAIN; if (cq->buf.entry_size == 64) cqe++; ++cq->mcq.cons_index; /* * Make sure we read CQ entry contents after we've checked the * ownership bit. */ rmb(); is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR; if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP && is_send)) { pr_warn("Completion for NOP opcode detected!\n"); return -EINVAL; } /* Resize CQ in progress */ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) { if (cq->resize_buf) { struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device); mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); cq->buf = cq->resize_buf->buf; cq->ibcq.cqe = cq->resize_buf->cqe; kfree(cq->resize_buf); cq->resize_buf = NULL; } goto repoll; } if (!*cur_qp || (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) { /* * We do not have to take the QP table lock here, * because CQs will be locked while QPs are removed * from the table. */ mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, be32_to_cpu(cqe->vlan_my_qpn)); if (unlikely(!mqp)) { pr_warn("CQ %06x with entry for unknown QPN %06x\n", cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK); return -EINVAL; } *cur_qp = to_mibqp(mqp); } wc->qp = &(*cur_qp)->ibqp; if (wc->qp->qp_type == IB_QPT_XRC_TGT) { u32 srq_num; g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); srq_num = g_mlpath_rqpn & 0xffffff; /* SRQ is also in the radix tree */ msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, srq_num); if (unlikely(!msrq)) { pr_warn("CQ %06x with entry for unknown SRQN %06x\n", cq->mcq.cqn, srq_num); return -EINVAL; } } if (is_send) { wq = &(*cur_qp)->sq; if (!(*cur_qp)->sq_signal_bits) { wqe_ctr = be16_to_cpu(cqe->wqe_index); wq->tail += (u16) (wqe_ctr - (u16) wq->tail); } wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; } else if ((*cur_qp)->ibqp.srq) { srq = to_msrq((*cur_qp)->ibqp.srq); wqe_ctr = be16_to_cpu(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_ctr]; mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else if (msrq) { srq = to_mibsrq(msrq); wqe_ctr = be16_to_cpu(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_ctr]; mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else { wq = &(*cur_qp)->rq; tail = wq->tail & (wq->wqe_cnt - 1); wc->wr_id = wq->wrid[tail]; ++wq->tail; } if (unlikely(is_error)) { mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); return 0; } wc->status = IB_WC_SUCCESS; if (is_send) { wc->wc_flags = 0; switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { case MLX4_OPCODE_RDMA_WRITE_IMM: wc->wc_flags |= IB_WC_WITH_IMM; /* fall through */ case MLX4_OPCODE_RDMA_WRITE: wc->opcode = IB_WC_RDMA_WRITE; break; case MLX4_OPCODE_SEND_IMM: wc->wc_flags |= IB_WC_WITH_IMM; case MLX4_OPCODE_SEND: case MLX4_OPCODE_SEND_INVAL: wc->opcode = IB_WC_SEND; break; case MLX4_OPCODE_RDMA_READ: wc->opcode = IB_WC_RDMA_READ; wc->byte_len = be32_to_cpu(cqe->byte_cnt); break; case MLX4_OPCODE_ATOMIC_CS: wc->opcode = IB_WC_COMP_SWAP; wc->byte_len = 8; break; case MLX4_OPCODE_ATOMIC_FA: wc->opcode = IB_WC_FETCH_ADD; wc->byte_len = 8; break; case MLX4_OPCODE_MASKED_ATOMIC_CS: wc->opcode = IB_WC_MASKED_COMP_SWAP; wc->byte_len = 8; break; case MLX4_OPCODE_MASKED_ATOMIC_FA: wc->opcode = IB_WC_MASKED_FETCH_ADD; wc->byte_len = 8; break; case MLX4_OPCODE_BIND_MW: wc->opcode = IB_WC_BIND_MW; break; case MLX4_OPCODE_LSO: wc->opcode = IB_WC_LSO; break; case MLX4_OPCODE_FMR: wc->opcode = IB_WC_FAST_REG_MR; break; case MLX4_OPCODE_LOCAL_INVAL: wc->opcode = IB_WC_LOCAL_INV; break; } } else { wc->byte_len = be32_to_cpu(cqe->byte_cnt); switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; wc->ex.imm_data = cqe->immed_rss_invalid; break; case MLX4_RECV_OPCODE_SEND_INVAL: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_INVALIDATE; wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid); break; case MLX4_RECV_OPCODE_SEND: wc->opcode = IB_WC_RECV; wc->wc_flags = 0; break; case MLX4_RECV_OPCODE_SEND_IMM: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; wc->ex.imm_data = cqe->immed_rss_invalid; break; } if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { if ((*cur_qp)->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) return use_tunnel_data (*cur_qp, cq, wc, tail, cqe, rdma_port_get_link_layer (wc->qp->device, (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET); } if (timestamp_en) { /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is * supported. CQ_CREATE_WITH_TIMESTAMPING_SYS isn't * supported */ if (cq->create_flags & IB_CQ_TIMESTAMP_TO_SYS_TIME) { wc->ts.timestamp = 0; } else { wc->ts.timestamp = ((u64)(be32_to_cpu(cqe->timestamp_16_47) + !cqe->timestamp_0_15) << 16) | be16_to_cpu(cqe->timestamp_0_15); wc->wc_flags |= IB_WC_WITH_TIMESTAMP; - } + } } else { wc->wc_flags |= IB_WC_WITH_SLID; - wc->slid = be16_to_cpu(cqe->rlid); + wc->slid = be16_to_cpu(cqe->rlid); } g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); wc->src_qp = g_mlpath_rqpn & 0xffffff; wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0; wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; if (!timestamp_en) { - if (rdma_port_get_link_layer(wc->qp->device, + if (rdma_port_get_link_layer(wc->qp->device, (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) - wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; - else - wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; + else + wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; wc->wc_flags |= IB_WC_WITH_SL; } if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_VLAN_PRESENT_MASK) && !timestamp_en) { wc->vlan_id = be16_to_cpu(cqe->sl_vid) & MLX4_CQE_VID_MASK; wc->wc_flags |= IB_WC_WITH_VLAN; } else { wc->vlan_id = 0xffff; } if (!timestamp_en) { memcpy(wc->smac, cqe->smac, 6); wc->wc_flags |= IB_WC_WITH_SMAC; } } return 0; } int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mlx4_ib_cq *cq = to_mcq(ibcq); struct mlx4_ib_qp *cur_qp = NULL; unsigned long flags; int npolled; int err = 0; spin_lock_irqsave(&cq->lock, flags); for (npolled = 0; npolled < num_entries; ++npolled) { err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled); if (err) break; } mlx4_cq_set_ci(&cq->mcq); spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) return npolled; else return err; } int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { mlx4_cq_arm(&to_mcq(ibcq)->mcq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT, to_mdev(ibcq->device)->priv_uar.map, MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock)); return 0; } void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) { u32 prod_index; int nfreed = 0; struct mlx4_cqe *cqe, *dest; u8 owner_bit; int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; /* * First we need to find the current producer index, so we * know where to start cleaning from. It doesn't matter if HW * adds new entries after this loop -- the QP we're worried * about is already in RESET, so the new entries won't come * from our QP and therefore don't need to be checked. */ for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index) if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) break; /* * Now sweep backwards through the CQ, removing CQ entries * that match our QP by copying older entries on top of them. */ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); cqe += cqe_inc; if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); ++nfreed; } else if (nfreed) { dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); dest += cqe_inc; owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; memcpy(dest, cqe, sizeof *cqe); dest->owner_sr_opcode = owner_bit | (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); } } if (nfreed) { cq->mcq.cons_index += nfreed; /* * Make sure update of buffer contents is done before * updating consumer index. */ wmb(); mlx4_cq_set_ci(&cq->mcq); } } void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) { spin_lock_irq(&cq->lock); __mlx4_ib_cq_clean(cq, qpn, srq); spin_unlock_irq(&cq->lock); } Index: head/sys/ofed/drivers/infiniband/hw/mlx4/mad.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/mad.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/mad.c (revision 296382) @@ -1,2350 +1,2350 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include "mlx4_ib.h" enum { MLX4_IB_VENDOR_CLASS1 = 0x9, MLX4_IB_VENDOR_CLASS2 = 0xa }; #define MLX4_TUN_SEND_WRID_SHIFT 34 #define MLX4_TUN_QPN_SHIFT 32 #define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) #define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) #define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) #define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) /* Port mgmt change event handling */ #define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) #define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) #define NUM_IDX_IN_PKEY_TBL_BLK 32 #define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ #define GUID_TBL_BLK_NUM_ENTRIES 8 #define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) struct mlx4_mad_rcv_buf { struct ib_grh grh; u8 payload[256]; } __packed; struct mlx4_mad_snd_buf { u8 payload[256]; } __packed; struct mlx4_tunnel_mad { struct ib_grh grh; struct mlx4_ib_tunnel_header hdr; struct ib_mad mad; } __packed; struct mlx4_rcv_tunnel_mad { struct mlx4_rcv_tunnel_hdr hdr; struct ib_grh grh; struct ib_mad mad; } __packed; static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, int block, u32 change_bitmap); __be64 mlx4_ib_gen_node_guid(void) { #define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) return cpu_to_be64(NODE_GUID_HI | random()); } __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) { return cpu_to_be64(atomic_inc_return(&ctx->tid)) | cpu_to_be64(0xff00000000000000LL); } int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad) { struct mlx4_cmd_mailbox *inmailbox, *outmailbox; void *inbox; int err; u32 in_modifier = port; u8 op_modifier = 0; inmailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(inmailbox)) return PTR_ERR(inmailbox); inbox = inmailbox->buf; outmailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(outmailbox)) { mlx4_free_cmd_mailbox(dev->dev, inmailbox); return PTR_ERR(outmailbox); } memcpy(inbox, in_mad, 256); /* * Key check traps can't be generated unless we have in_wc to * tell us where to send the trap. */ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) op_modifier |= 0x1; if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) op_modifier |= 0x2; if (mlx4_is_mfunc(dev->dev) && (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) op_modifier |= 0x8; if (in_wc) { struct { __be32 my_qpn; u32 reserved1; __be32 rqpn; u8 sl; u8 g_path; u16 reserved2[2]; __be16 pkey; u32 reserved3[11]; u8 grh[40]; } *ext_info; memset(inbox + 256, 0, 256); ext_info = inbox + 256; ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); ext_info->rqpn = cpu_to_be32(in_wc->src_qp); ext_info->sl = in_wc->sl << 4; ext_info->g_path = in_wc->dlid_path_bits | (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); ext_info->pkey = cpu_to_be16(in_wc->pkey_index); if (in_grh) memcpy(ext_info->grh, in_grh, 40); op_modifier |= 0x4; in_modifier |= in_wc->slid << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); if (!err) memcpy(response_mad, outmailbox->buf, 256); mlx4_free_cmd_mailbox(dev->dev, inmailbox); mlx4_free_cmd_mailbox(dev->dev, outmailbox); return err; } static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) { struct ib_ah *new_ah; struct ib_ah_attr ah_attr; unsigned long flags; if (!dev->send_agent[port_num - 1][0]) return; memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = lid; ah_attr.sl = sl; ah_attr.port_num = port_num; new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, &ah_attr); if (IS_ERR(new_ah)) return; spin_lock_irqsave(&dev->sm_lock, flags); if (dev->sm_ah[port_num - 1]) ib_destroy_ah(dev->sm_ah[port_num - 1]); dev->sm_ah[port_num - 1] = new_ah; spin_unlock_irqrestore(&dev->sm_lock, flags); } /* * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can * synthesize LID change, Client-Rereg, GID change, and P_Key change events. */ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, u16 prev_lid) { struct ib_port_info *pinfo; u16 lid; __be16 *base; u32 bn, pkey_change_bitmap; int i; struct mlx4_ib_dev *dev = to_mdev(ibdev); if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_SET) switch (mad->mad_hdr.attr_id) { case IB_SMP_ATTR_PORT_INFO: pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; lid = be16_to_cpu(pinfo->lid); update_sm_ah(dev, port_num, be16_to_cpu(pinfo->sm_lid), pinfo->neighbormtu_mastersmsl & 0xf); if (pinfo->clientrereg_resv_subnetto & 0x80) handle_client_rereg_event(dev, port_num); if (prev_lid != lid) handle_lid_change_event(dev, port_num); break; case IB_SMP_ATTR_PKEY_TABLE: if (!mlx4_is_mfunc(dev->dev)) { mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_PKEY_CHANGE); break; } /* at this point, we are running in the master. * Slaves do not receive SMPs. */ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); pkey_change_bitmap = 0; for (i = 0; i < 32; i++) { pr_debug("PKEY[%d] = x%x\n", i + bn*32, be16_to_cpu(base[i])); if (be16_to_cpu(base[i]) != dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { pkey_change_bitmap |= (1 << i); dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = be16_to_cpu(base[i]); } } pr_debug("PKEY Change event: port=%d, " "block=0x%x, change_bitmap=0x%x\n", port_num, bn, pkey_change_bitmap); if (pkey_change_bitmap) { mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_PKEY_CHANGE); if (!dev->sriov.is_going_down) __propagate_pkey_ev(dev, port_num, bn, pkey_change_bitmap); } break; case IB_SMP_ATTR_GUID_INFO: /* paravirtualized master's guid is guid 0 -- does not change */ if (!mlx4_is_master(dev->dev)) mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_GID_CHANGE); /*if master, notify relevant slaves*/ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, (u8 *)(&((struct ib_smp *)mad)->data)); mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, (u8 *)(&((struct ib_smp *)mad)->data)); } break; default: break; } } static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, int block, u32 change_bitmap) { int i, ix, slave, err; int have_event = 0; for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { if (slave == mlx4_master_func_num(dev->dev)) continue; if (!mlx4_is_slave_active(dev->dev, slave)) continue; have_event = 0; for (i = 0; i < 32; i++) { if (!(change_bitmap & (1 << i))) continue; for (ix = 0; ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] [ix] == i + 32 * block) { err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); pr_debug("propagate_pkey_ev: slave %d," " port %d, ix %d (%d)\n", slave, port_num, ix, err); have_event = 1; break; } } if (have_event) break; } } } static void node_desc_override(struct ib_device *dev, struct ib_mad *mad) { unsigned long flags; if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); } } static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad) { int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; struct ib_mad_send_buf *send_buf; struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; int ret; unsigned long flags; if (agent) { send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC); if (IS_ERR(send_buf)) return; /* * We rely here on the fact that MLX QPs don't use the * address handle after the send is posted (this is * wrong following the IB spec strictly, but we know * it's OK for our devices). */ spin_lock_irqsave(&dev->sm_lock, flags); memcpy(send_buf->mad, mad, sizeof *mad); if ((send_buf->ah = dev->sm_ah[port_num - 1])) ret = ib_post_send_mad(send_buf, NULL); else ret = -EINVAL; spin_unlock_irqrestore(&dev->sm_lock, flags); if (ret) ib_free_send_mad(send_buf); } } static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad) { int ret = 0; /* dispatch to different sa handlers */ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { case IB_SA_ATTR_MC_MEMBER_REC: ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); break; default: break; } return ret; } int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); int i; for (i = 0; i < dev->dev->caps.sqp_demux; i++) { if (dev->sriov.demux[port - 1].guid_cache[i] == guid) return i; } return -1; } static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, u8 port, u16 pkey, u16 *ix) { int i, ret; u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; u16 slot_pkey; if (slave == mlx4_master_func_num(dev->dev)) return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) continue; pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); if (ret) continue; if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { if (slot_pkey & 0x8000) { *ix = (u16) pkey_ix; return 0; } else { /* take first partial pkey index found */ if (partial_ix == 0xFF) partial_ix = pkey_ix; } } } if (partial_ix < 0xFF) { *ix = (u16) partial_ix; return 0; } return -EINVAL; } int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad) { struct ib_sge list; struct ib_send_wr wr, *bad_wr; struct mlx4_ib_demux_pv_ctx *tun_ctx; struct mlx4_ib_demux_pv_qp *tun_qp; struct mlx4_rcv_tunnel_mad *tun_mad; struct ib_ah_attr attr; struct ib_ah *ah; struct ib_qp *src_qp = NULL; unsigned tun_tx_ix = 0; int dqpn; int ret = 0; u16 tun_pkey_ix; u16 cached_pkey; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; if (dest_qpt > IB_QPT_GSI) return -EINVAL; tun_ctx = dev->sriov.demux[port-1].tun[slave]; /* check if proxy qp created */ if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; /* QP0 forwarding only for Dom0 */ if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave)) return -EINVAL; if (!dest_qpt) tun_qp = &tun_ctx->qp[0]; else tun_qp = &tun_ctx->qp[1]; /* compute P_Key index to put in tunnel header for slave */ if (dest_qpt) { u16 pkey_ix; ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); if (ret) return -EINVAL; ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); if (ret) return -EINVAL; tun_pkey_ix = pkey_ix; } else tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; /* get tunnel tx data buf for slave */ src_qp = tun_qp->qp; /* create ah. Just need an empty one with the port num for the post send. * The driver will set the force loopback bit in post_send */ memset(&attr, 0, sizeof attr); attr.port_num = port; if (is_eth) { memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); attr.ah_flags = IB_AH_GRH; } ah = ib_create_ah(tun_ctx->pd, &attr); if (IS_ERR(ah)) return -ENOMEM; /* allocate tunnel tx buf after pass failure returns */ spin_lock(&tun_qp->tx_lock); if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= (MLX4_NUM_TUNNEL_BUFS - 1)) ret = -EAGAIN; else tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); spin_unlock(&tun_qp->tx_lock); if (ret) goto out; tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); if (tun_qp->tx_ring[tun_tx_ix].ah) ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); tun_qp->tx_ring[tun_tx_ix].ah = ah; ib_dma_sync_single_for_cpu(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), DMA_TO_DEVICE); /* copy over to tunnel buffer */ if (grh) memcpy(&tun_mad->grh, grh, sizeof *grh); memcpy(&tun_mad->mad, mad, sizeof *mad); /* adjust tunnel data */ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; if (is_eth) { u16 vlan = 0; if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, NULL)) { if (vlan != wc->vlan_id) /* VST and default vlan is not the packet vlan drop the * packet*/ goto out; else /* VST , remove hide the vlan from the VF */ vlan = 0; } else { vlan = wc->vlan_id; } tun_mad->hdr.sl_vid = cpu_to_be16(vlan); memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), DMA_TO_DEVICE); list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; list.length = sizeof (struct mlx4_rcv_tunnel_mad); list.lkey = tun_ctx->mr->lkey; wr.wr.ud.ah = ah; wr.wr.ud.port_num = port; wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; wr.wr.ud.remote_qpn = dqpn; wr.next = NULL; wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; ret = ib_post_send(src_qp, &wr, &bad_wr); out: if (ret) ib_destroy_ah(ah); return ret; } static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad) { struct mlx4_ib_dev *dev = to_mdev(ibdev); int err; int slave; u8 *slave_id; int is_eth = 0; if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) is_eth = 0; else is_eth = 1; if (is_eth) { if (!wc->wc_flags & IB_WC_GRH) { mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); return -EINVAL; } if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); return -EINVAL; } if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { mlx4_ib_warn(ibdev, "failed matching grh\n"); return -ENOENT; } if (slave >= dev->dev->caps.sqp_demux) { mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", slave, dev->dev->caps.sqp_demux); return -ENOENT; } if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth)) return 0; err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); if (err) pr_debug("failed sending to slave %d via tunnel qp (%d)\n", slave, err); return 0; } /* Initially assume that this mad is for us */ slave = mlx4_master_func_num(dev->dev); /* See if the slave id is encoded in a response mad */ if (mad->mad_hdr.method & 0x80) { slave_id = (u8 *) &mad->mad_hdr.tid; slave = *slave_id; if (slave != 255) /*255 indicates the dom0*/ *slave_id = 0; /* remap tid */ } /* If a grh is present, we demux according to it */ if (wc->wc_flags & IB_WC_GRH) { slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id); if (slave < 0) { mlx4_ib_warn(ibdev, "failed matching grh\n"); return -ENOENT; } } /* Class-specific handling */ switch (mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_demux_sa_handler(ibdev, port, slave, (struct ib_sa_mad *) mad)) return 0; break; case IB_MGMT_CLASS_CM: if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth)) return 0; break; case IB_MGMT_CLASS_DEVICE_MGMT: if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) return 0; break; default: /* Drop unsupported classes for slaves in tunnel mode */ if (slave != mlx4_master_func_num(dev->dev)) { pr_debug("dropping unsupported ingress mad from class:%d " "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); return 0; } } /*make sure that no slave==255 was not handled yet.*/ if (slave >= dev->dev->caps.sqp_demux) { mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", slave, dev->dev->caps.sqp_demux); return -ENOENT; } err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); if (err) pr_debug("failed sending to slave %d via tunnel qp (%d)\n", slave, err); return 0; } static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid, prev_lid = 0; int err; struct ib_port_attr pattr; if (in_wc && in_wc->qp->qp_num) { pr_debug("received MAD: slid:%d sqpn:%d " "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", in_wc->slid, in_wc->src_qp, in_wc->dlid_path_bits, in_wc->qp->qp_num, in_wc->wc_flags, in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, be16_to_cpu(in_mad->mad_hdr.attr_id)); if (in_wc->wc_flags & IB_WC_GRH) { pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", (unsigned long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix), (unsigned long long)be64_to_cpu(in_grh->sgid.global.interface_id)); pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", (unsigned long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix), (unsigned long long)be64_to_cpu(in_grh->dgid.global.interface_id)); } } slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; } if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) return IB_MAD_RESULT_SUCCESS; /* * Don't process SMInfo queries -- the SMA can't handle them. */ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) return IB_MAD_RESULT_SUCCESS; } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 || in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) return IB_MAD_RESULT_SUCCESS; } else return IB_MAD_RESULT_SUCCESS; if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) prev_lid = pattr.lid; err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) | MLX4_MAD_IFC_NET_VIEW, port_num, in_wc, in_grh, in_mad, out_mad); if (err) return IB_MAD_RESULT_FAILURE; if (!out_mad->mad_hdr.status) { if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)) smp_snoop(ibdev, port_num, in_mad, prev_lid); /* slaves get node desc from FW */ if (!mlx4_is_slave(to_mdev(ibdev)->dev)) node_desc_override(ibdev, out_mad); } /* set return bit in status of directed route responses */ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) /* no response for trap repress */ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } static void edit_counter_ext(struct mlx4_if_stat_extended *cnt, void *counters, __be16 attr_id) { switch (attr_id) { case IB_PMA_PORT_COUNTERS: { struct ib_pma_portcounters *pma_cnt = (struct ib_pma_portcounters *)counters; pma_cnt->port_xmit_data = cpu_to_be32((be64_to_cpu(cnt->counters[0]. IfTxUnicastOctets) + be64_to_cpu(cnt->counters[0]. IfTxMulticastOctets) + be64_to_cpu(cnt->counters[0]. IfTxBroadcastOctets) + be64_to_cpu(cnt->counters[0]. IfTxDroppedOctets)) >> 2); pma_cnt->port_rcv_data = cpu_to_be32((be64_to_cpu(cnt->counters[0]. IfRxUnicastOctets) + be64_to_cpu(cnt->counters[0]. IfRxMulticastOctets) + be64_to_cpu(cnt->counters[0]. IfRxBroadcastOctets) + be64_to_cpu(cnt->counters[0]. IfRxNoBufferOctets) + be64_to_cpu(cnt->counters[0]. IfRxErrorOctets)) >> 2); pma_cnt->port_xmit_packets = cpu_to_be32(be64_to_cpu(cnt->counters[0]. IfTxUnicastFrames) + be64_to_cpu(cnt->counters[0]. IfTxMulticastFrames) + be64_to_cpu(cnt->counters[0]. IfTxBroadcastFrames) + be64_to_cpu(cnt->counters[0]. IfTxDroppedFrames)); pma_cnt->port_rcv_packets = cpu_to_be32(be64_to_cpu(cnt->counters[0]. IfRxUnicastFrames) + be64_to_cpu(cnt->counters[0]. IfRxMulticastFrames) + be64_to_cpu(cnt->counters[0]. IfRxBroadcastFrames) + be64_to_cpu(cnt->counters[0]. IfRxNoBufferFrames) + be64_to_cpu(cnt->counters[0]. IfRxErrorFrames)); pma_cnt->port_rcv_errors = cpu_to_be32(be64_to_cpu(cnt-> counters[0]. IfRxErrorFrames)); break; } case IB_PMA_PORT_COUNTERS_EXT: { struct ib_pma_portcounters_ext *pma_cnt_ext = (struct ib_pma_portcounters_ext *)counters; pma_cnt_ext->port_xmit_data = cpu_to_be64((be64_to_cpu(cnt->counters[0]. IfTxUnicastOctets) + be64_to_cpu(cnt->counters[0]. IfTxMulticastOctets) + be64_to_cpu(cnt->counters[0]. IfTxBroadcastOctets) + be64_to_cpu(cnt->counters[0]. IfTxDroppedOctets)) >> 2); pma_cnt_ext->port_rcv_data = cpu_to_be64((be64_to_cpu(cnt->counters[0]. IfRxUnicastOctets) + be64_to_cpu(cnt->counters[0]. IfRxMulticastOctets) + be64_to_cpu(cnt->counters[0]. IfRxBroadcastOctets) + be64_to_cpu(cnt->counters[0]. IfRxNoBufferOctets) + be64_to_cpu(cnt->counters[0]. IfRxErrorOctets)) >> 2); pma_cnt_ext->port_xmit_packets = cpu_to_be64(be64_to_cpu(cnt->counters[0]. IfTxUnicastFrames) + be64_to_cpu(cnt->counters[0]. IfTxMulticastFrames) + be64_to_cpu(cnt->counters[0]. IfTxBroadcastFrames) + be64_to_cpu(cnt->counters[0]. IfTxDroppedFrames)); pma_cnt_ext->port_rcv_packets = cpu_to_be64(be64_to_cpu(cnt->counters[0]. IfRxUnicastFrames) + be64_to_cpu(cnt->counters[0]. IfRxMulticastFrames) + be64_to_cpu(cnt->counters[0]. IfRxBroadcastFrames) + be64_to_cpu(cnt->counters[0]. IfRxNoBufferFrames) + be64_to_cpu(cnt->counters[0]. IfRxErrorFrames)); pma_cnt_ext->port_unicast_xmit_packets = cnt->counters[0]. IfTxUnicastFrames; pma_cnt_ext->port_unicast_rcv_packets = cnt->counters[0]. IfRxUnicastFrames; pma_cnt_ext->port_multicast_xmit_packets = cpu_to_be64(be64_to_cpu(cnt->counters[0]. IfTxMulticastFrames) + be64_to_cpu(cnt->counters[0]. IfTxBroadcastFrames)); pma_cnt_ext->port_multicast_rcv_packets = cpu_to_be64(be64_to_cpu(cnt->counters[0]. IfTxMulticastFrames) + be64_to_cpu(cnt->counters[0]. IfTxBroadcastFrames)); break; } default: pr_warn("Unsupported attr_id 0x%x\n", attr_id); break; } } static void edit_counter(struct mlx4_if_stat_basic *cnt, void *counters, __be16 attr_id) { switch (attr_id) { case IB_PMA_PORT_COUNTERS: { struct ib_pma_portcounters *pma_cnt = (struct ib_pma_portcounters *) counters; pma_cnt->port_xmit_data = cpu_to_be32(be64_to_cpu( cnt->counters[0].IfTxOctets) >> 2); pma_cnt->port_rcv_data = cpu_to_be32(be64_to_cpu( cnt->counters[0].IfRxOctets) >> 2); pma_cnt->port_xmit_packets = cpu_to_be32(be64_to_cpu(cnt->counters[0].IfTxFrames)); pma_cnt->port_rcv_packets = cpu_to_be32(be64_to_cpu(cnt->counters[0].IfRxFrames)); break; } case IB_PMA_PORT_COUNTERS_EXT: { struct ib_pma_portcounters_ext *pma_cnt_ext = (struct ib_pma_portcounters_ext *) counters; pma_cnt_ext->port_xmit_data = cpu_to_be64((be64_to_cpu(cnt->counters[0]. IfTxOctets) >> 2)); pma_cnt_ext->port_rcv_data = cpu_to_be64((be64_to_cpu(cnt->counters[0]. IfRxOctets) >> 2)); pma_cnt_ext->port_xmit_packets = cnt->counters[0].IfTxFrames; pma_cnt_ext->port_rcv_packets = cnt->counters[0].IfRxFrames; break; } default: pr_warn("Unsupported attr_id 0x%x\n", attr_id); break; } } int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index, union mlx4_counter *counter, u8 clear) { struct mlx4_cmd_mailbox *mailbox; int err; u32 inmod = counter_index | ((clear & 1) << 31); mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return IB_MAD_RESULT_FAILURE; err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0, MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (!err) memcpy(counter, mailbox->buf, MLX4_IF_STAT_SZ(1)); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { struct mlx4_ib_dev *dev = to_mdev(ibdev); int err; u32 counter_index = dev->counters[port_num - 1].counter_index & 0xffff; u8 mode; char counter_buf[MLX4_IF_STAT_SZ(1)]; union mlx4_counter *counter = (union mlx4_counter *) counter_buf; if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) return -EINVAL; /* in case of default counter IB shares the counter with ETH */ /* the state could be -EEXIST or -ENOSPC */ if (dev->counters[port_num - 1].status) { memset(out_mad->data, 0, sizeof out_mad->data); err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } else { if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0)) return IB_MAD_RESULT_FAILURE; memset(out_mad->data, 0, sizeof(out_mad->data)); mode = counter->control.cnt_mode & 0xFF; err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; switch (mode & 0xf) { case 0: edit_counter((void *)counter, (void *)(out_mad->data + 40), in_mad->mad_hdr.attr_id); break; case 1: edit_counter_ext((void *)counter, (void *)(out_mad->data + 40), in_mad->mad_hdr.attr_id); break; default: err = IB_MAD_RESULT_FAILURE; } } return err; } int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { switch (rdma_port_get_link_layer(ibdev, port_num)) { case IB_LINK_LAYER_INFINIBAND: return ib_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); case IB_LINK_LAYER_ETHERNET: return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); default: return -EINVAL; } } static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { if (mad_send_wc->send_buf->context[0]) ib_destroy_ah(mad_send_wc->send_buf->context[0]); ib_free_send_mad(mad_send_wc->send_buf); } int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) { struct ib_mad_agent *agent; int p, q; int ret; enum rdma_link_layer ll; for (p = 0; p < dev->num_ports; ++p) { ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); for (q = 0; q <= 1; ++q) { if (ll == IB_LINK_LAYER_INFINIBAND) { agent = ib_register_mad_agent(&dev->ib_dev, p + 1, q ? IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, NULL, NULL); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; } dev->send_agent[p][q] = agent; } else dev->send_agent[p][q] = NULL; } } return 0; err: for (p = 0; p < dev->num_ports; ++p) for (q = 0; q <= 1; ++q) if (dev->send_agent[p][q]) ib_unregister_mad_agent(dev->send_agent[p][q]); return ret; } void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) { struct ib_mad_agent *agent; int p, q; for (p = 0; p < dev->num_ports; ++p) { for (q = 0; q <= 1; ++q) { agent = dev->send_agent[p][q]; if (agent) { dev->send_agent[p][q] = NULL; ib_unregister_mad_agent(agent); } } if (dev->sm_ah[p]) ib_destroy_ah(dev->sm_ah[p]); } } static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) { mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, MLX4_EQ_PORT_INFO_LID_CHANGE_MASK, 0, 0); } static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) { /* re-configure the alias-guid and mcg's */ if (mlx4_is_master(dev->dev)) { mlx4_ib_invalidate_all_guid_record(dev, port_num); if (!dev->sriov.is_going_down) { mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK, 0, 0); } } mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); } static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, struct mlx4_eqe *eqe) { __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), GET_MASK_FROM_EQE(eqe)); } static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, u32 guid_tbl_blk_num, u32 change_bitmap) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; u16 i; if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) return; in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) { mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n"); goto out; } guid_tbl_blk_num *= 4; for (i = 0; i < 4; i++) { if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) continue; memset(in_mad, 0, sizeof *in_mad); memset(out_mad, 0, sizeof *out_mad); in_mad->base_version = 1; in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; in_mad->class_version = 1; in_mad->method = IB_MGMT_METHOD_GET; in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); if (mlx4_MAD_IFC(dev, MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, port_num, NULL, NULL, in_mad, out_mad)) { mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); goto out; } mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, port_num, (u8 *)(&((struct ib_smp *)out_mad)->data)); mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, port_num, (u8 *)(&((struct ib_smp *)out_mad)->data)); } out: kfree(in_mad); kfree(out_mad); return; } void handle_port_mgmt_change_event(struct work_struct *work) { struct ib_event_work *ew = container_of(work, struct ib_event_work, work); struct mlx4_ib_dev *dev = ew->ib_dev; struct mlx4_eqe *eqe = &(ew->ib_eqe); u8 port = eqe->event.port_mgmt_change.port; u32 changed_attr; u32 tbl_block; u32 change_bitmap; switch (eqe->subtype) { case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); /* Update the SM ah - This should be done before handling the other changed attributes so that MADs can be sent to the SM */ if (changed_attr & MSTR_SM_CHANGE_MASK) { u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; update_sm_ah(dev, port, lid, sl); mlx4_ib_dispatch_event(dev, port, IB_EVENT_SM_CHANGE); if (mlx4_is_master(dev->dev)) mlx4_gen_slaves_port_mgt_ev(dev->dev, port, changed_attr & MSTR_SM_CHANGE_MASK, lid, sl); } /* Check if it is a lid change event */ if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) handle_lid_change_event(dev, port); /* Generate GUID changed event */ if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); /*if master, notify all slaves*/ if (mlx4_is_master(dev->dev)) mlx4_gen_slaves_port_mgt_ev(dev->dev, port, MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK, 0, 0); } if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) handle_client_rereg_event(dev, port); break; case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) propagate_pkey_ev(dev, port, eqe); break; case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: /* paravirtualized master's guid is guid 0 -- does not change */ if (!mlx4_is_master(dev->dev)) mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); /*if master, notify relevant slaves*/ else if (!dev->sriov.is_going_down) { tbl_block = GET_BLK_PTR_FROM_EQE(eqe); change_bitmap = GET_MASK_FROM_EQE(eqe); handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); } break; default: pr_warn("Unsupported subtype 0x%x for " "Port Management Change event\n", eqe->subtype); } kfree(ew); } void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, enum ib_event_type type) { struct ib_event event; event.device = &dev->ib_dev; event.element.port_num = port_num; event.event = type; ib_dispatch_event(&event); } static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) { unsigned long flags; struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); spin_lock_irqsave(&dev->sriov.going_down_lock, flags); if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) queue_work(ctx->wq, &ctx->work); spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); } static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, struct mlx4_ib_demux_pv_qp *tun_qp, int index) { struct ib_sge sg_list; struct ib_recv_wr recv_wr, *bad_recv_wr; int size; size = (tun_qp->qp->qp_type == IB_QPT_UD) ? sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); sg_list.addr = tun_qp->ring[index].map; sg_list.length = size; sg_list.lkey = ctx->mr->lkey; recv_wr.next = NULL; recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, size, DMA_FROM_DEVICE); return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); } static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad) { int ret = 0; /* dispatch to different sa handlers */ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { case IB_SA_ATTR_MC_MEMBER_REC: ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); break; default: break; } return ret; } static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) { int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; return (qpn >= proxy_start && qpn <= proxy_start + 1); } int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, struct ib_mad *mad) { struct ib_sge list; struct ib_send_wr wr, *bad_wr; struct mlx4_ib_demux_pv_ctx *sqp_ctx; struct mlx4_ib_demux_pv_qp *sqp; struct mlx4_mad_snd_buf *sqp_mad; struct ib_ah *ah; struct ib_qp *send_qp = NULL; unsigned wire_tx_ix = 0; int ret = 0; u16 wire_pkey_ix; int src_qpnum; u8 sgid_index; sqp_ctx = dev->sriov.sqps[port-1]; /* check if proxy qp created */ if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; /* QP0 forwarding only for Dom0 */ if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave)) return -EINVAL; if (dest_qpt == IB_QPT_SMI) { src_qpnum = 0; sqp = &sqp_ctx->qp[0]; wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; } else { src_qpnum = 1; sqp = &sqp_ctx->qp[1]; wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; } send_qp = sqp->qp; /* create ah */ sgid_index = attr->grh.sgid_index; attr->grh.sgid_index = 0; ah = ib_create_ah(sqp_ctx->pd, attr); if (IS_ERR(ah)) return -ENOMEM; attr->grh.sgid_index = sgid_index; to_mah(ah)->av.ib.gid_index = sgid_index; /* get rid of force-loopback bit */ to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); spin_lock(&sqp->tx_lock); if (sqp->tx_ix_head - sqp->tx_ix_tail >= (MLX4_NUM_TUNNEL_BUFS - 1)) ret = -EAGAIN; else wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); spin_unlock(&sqp->tx_lock); if (ret) goto out; sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); if (sqp->tx_ring[wire_tx_ix].ah) ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); sqp->tx_ring[wire_tx_ix].ah = ah; ib_dma_sync_single_for_cpu(&dev->ib_dev, sqp->tx_ring[wire_tx_ix].buf.map, sizeof (struct mlx4_mad_snd_buf), DMA_TO_DEVICE); memcpy(&sqp_mad->payload, mad, sizeof *mad); ib_dma_sync_single_for_device(&dev->ib_dev, sqp->tx_ring[wire_tx_ix].buf.map, sizeof (struct mlx4_mad_snd_buf), DMA_TO_DEVICE); list.addr = sqp->tx_ring[wire_tx_ix].buf.map; list.length = sizeof (struct mlx4_mad_snd_buf); list.lkey = sqp_ctx->mr->lkey; wr.wr.ud.ah = ah; wr.wr.ud.port_num = port; wr.wr.ud.pkey_index = wire_pkey_ix; wr.wr.ud.remote_qkey = qkey; wr.wr.ud.remote_qpn = remote_qpn; wr.next = NULL; wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; if (s_mac) memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); ret = ib_post_send(send_qp, &wr, &bad_wr); out: if (ret) ib_destroy_ah(ah); return ret; } static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) { int gids; int vfs; if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) return slave; gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; vfs = dev->dev->num_vfs; if (slave == 0) return 0; if (slave <= gids % vfs) return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1); return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1)); } static int get_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, struct ib_ah_attr *ah_attr) { if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) { ah_attr->grh.sgid_index = slave; return 0; } ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); return 0; } static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) { struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; struct mlx4_ib_ah ah; struct ib_ah_attr ah_attr; u8 *slave_id; int slave; /* Get slave that sent this packet */ if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || (wc->src_qp & 0x1) != ctx->port - 1 || wc->src_qp & 0x4) { mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); return; } slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; if (slave != ctx->slave) { mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " "belongs to another slave\n", wc->src_qp); return; } if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) { mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " "non-master trying to send QP0 packets\n", wc->src_qp); return; } /* Map transaction ID */ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, sizeof (struct mlx4_tunnel_mad), DMA_FROM_DEVICE); switch (tunnel->mad.mad_hdr.method) { case IB_MGMT_METHOD_SET: case IB_MGMT_METHOD_GET: case IB_MGMT_METHOD_REPORT: case IB_SA_METHOD_GET_TABLE: case IB_SA_METHOD_DELETE: case IB_SA_METHOD_GET_MULTI: case IB_SA_METHOD_GET_TRACE_TBL: slave_id = (u8 *) &tunnel->mad.mad_hdr.tid; if (*slave_id) { mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " "class:%d slave:%d\n", *slave_id, tunnel->mad.mad_hdr.mgmt_class, slave); return; } else *slave_id = slave; default: /* nothing */; } /* Class-specific handling */ switch (tunnel->mad.mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, (struct ib_sa_mad *) &tunnel->mad)) return; break; case IB_MGMT_CLASS_CM: if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, (struct ib_mad *) &tunnel->mad)) return; break; case IB_MGMT_CLASS_DEVICE_MGMT: if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET && tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET) return; break; default: /* Drop unsupported classes for slaves in tunnel mode */ if (slave != mlx4_master_func_num(dev->dev)) { mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); return; } } /* We are using standard ib_core services to send the mad, so generate a * stadard address handle by decoding the tunnelled mlx4_ah fields */ memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); ah.ibah.device = ctx->ib_dev; mlx4_ib_query_ah(&ah.ibah, &ah_attr); if (ah_attr.ah_flags & IB_AH_GRH) if (get_real_sgid_index(dev, slave, ctx->port, &ah_attr)) return; memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); ah_attr.vlan_id = tunnel->hdr.vlan; /* if slave have default vlan use it */ mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, &ah_attr.vlan_id, &ah_attr.sl); mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? IB_QPT_SMI : IB_QPT_GSI, be16_to_cpu(tunnel->hdr.pkey_index), be32_to_cpu(tunnel->hdr.remote_qpn), be32_to_cpu(tunnel->hdr.qkey), &ah_attr, wc->smac, &tunnel->mad); } static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, enum ib_qp_type qp_type, int is_tun) { int i; struct mlx4_ib_demux_pv_qp *tun_qp; int rx_buf_size, tx_buf_size; if (qp_type > IB_QPT_GSI) return -EINVAL; tun_qp = &ctx->qp[qp_type]; tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, GFP_KERNEL); if (!tun_qp->ring) return -ENOMEM; tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, sizeof (struct mlx4_ib_tun_tx_buf), GFP_KERNEL); if (!tun_qp->tx_ring) { kfree(tun_qp->ring); tun_qp->ring = NULL; return -ENOMEM; } if (is_tun) { rx_buf_size = sizeof (struct mlx4_tunnel_mad); tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); } else { rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); tx_buf_size = sizeof (struct mlx4_mad_snd_buf); } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); if (!tun_qp->ring[i].addr) goto err; tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, tun_qp->ring[i].addr, rx_buf_size, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map))) { mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n"); kfree(tun_qp->ring[i].addr); goto err; } } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { tun_qp->tx_ring[i].buf.addr = kmalloc(tx_buf_size, GFP_KERNEL); if (!tun_qp->tx_ring[i].buf.addr) goto tx_err; tun_qp->tx_ring[i].buf.map = ib_dma_map_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.addr, tx_buf_size, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(ctx->ib_dev, tun_qp->tx_ring[i].buf.map))) { mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n"); kfree(tun_qp->tx_ring[i].buf.addr); goto tx_err; } tun_qp->tx_ring[i].ah = NULL; } spin_lock_init(&tun_qp->tx_lock); tun_qp->tx_ix_head = 0; tun_qp->tx_ix_tail = 0; tun_qp->proxy_qpt = qp_type; return 0; tx_err: while (i > 0) { --i; ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, tx_buf_size, DMA_TO_DEVICE); kfree(tun_qp->tx_ring[i].buf.addr); } kfree(tun_qp->tx_ring); tun_qp->tx_ring = NULL; i = MLX4_NUM_TUNNEL_BUFS; err: while (i > 0) { --i; ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, rx_buf_size, DMA_FROM_DEVICE); kfree(tun_qp->ring[i].addr); } kfree(tun_qp->ring); tun_qp->ring = NULL; return -ENOMEM; } static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, enum ib_qp_type qp_type, int is_tun) { int i; struct mlx4_ib_demux_pv_qp *tun_qp; int rx_buf_size, tx_buf_size; if (qp_type > IB_QPT_GSI) return; tun_qp = &ctx->qp[qp_type]; if (is_tun) { rx_buf_size = sizeof (struct mlx4_tunnel_mad); tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); } else { rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); tx_buf_size = sizeof (struct mlx4_mad_snd_buf); } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, rx_buf_size, DMA_FROM_DEVICE); kfree(tun_qp->ring[i].addr); } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, tx_buf_size, DMA_TO_DEVICE); kfree(tun_qp->tx_ring[i].buf.addr); if (tun_qp->tx_ring[i].ah) ib_destroy_ah(tun_qp->tx_ring[i].ah); } kfree(tun_qp->tx_ring); kfree(tun_qp->ring); } static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) { struct mlx4_ib_demux_pv_ctx *ctx; struct mlx4_ib_demux_pv_qp *tun_qp; struct ib_wc wc; int ret; ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; if (wc.status == IB_WC_SUCCESS) { switch (wc.opcode) { case IB_WC_RECV: mlx4_ib_multiplex_mad(ctx, &wc); ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)); if (ret) pr_err("Failed reposting tunnel " "buf:%lld\n", (unsigned long long)wc.wr_id); break; case IB_WC_SEND: pr_debug("received tunnel send completion:" "wrid=0x%llx, status=0x%x\n", (unsigned long long)wc.wr_id, wc.status); ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&tun_qp->tx_lock); tun_qp->tx_ix_tail++; spin_unlock(&tun_qp->tx_lock); break; default: break; } } else { pr_debug("mlx4_ib: completion error in tunnel: %d." " status = %d, wrid = 0x%llx\n", ctx->slave, wc.status, (unsigned long long)wc.wr_id); if (!MLX4_TUN_IS_RECV(wc.wr_id)) { ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&tun_qp->tx_lock); tun_qp->tx_ix_tail++; spin_unlock(&tun_qp->tx_lock); } } } } static void pv_qp_event_handler(struct ib_event *event, void *qp_context) { struct mlx4_ib_demux_pv_ctx *sqp = qp_context; /* It's worse than that! He's dead, Jim! */ pr_err("Fatal error (%d) on a MAD QP on port %d\n", event->event, sqp->port); } static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, enum ib_qp_type qp_type, int create_tun) { int i, ret; struct mlx4_ib_demux_pv_qp *tun_qp; struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; struct ib_qp_attr attr; int qp_attr_mask_INIT; if (qp_type > IB_QPT_GSI) return -EINVAL; tun_qp = &ctx->qp[qp_type]; memset(&qp_init_attr, 0, sizeof qp_init_attr); qp_init_attr.init_attr.send_cq = ctx->cq; qp_init_attr.init_attr.recv_cq = ctx->cq; qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; qp_init_attr.init_attr.cap.max_send_sge = 1; qp_init_attr.init_attr.cap.max_recv_sge = 1; if (create_tun) { qp_init_attr.init_attr.qp_type = IB_QPT_UD; qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_TUNNEL_QP; qp_init_attr.port = ctx->port; qp_init_attr.slave = ctx->slave; qp_init_attr.proxy_qp_type = qp_type; qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; } else { qp_init_attr.init_attr.qp_type = qp_type; qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_SQP; qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; } qp_init_attr.init_attr.port_num = ctx->port; qp_init_attr.init_attr.qp_context = ctx; qp_init_attr.init_attr.event_handler = pv_qp_event_handler; tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); if (IS_ERR(tun_qp->qp)) { ret = PTR_ERR(tun_qp->qp); tun_qp->qp = NULL; pr_err("Couldn't create %s QP (%d)\n", create_tun ? "tunnel" : "special", ret); return ret; } memset(&attr, 0, sizeof attr); attr.qp_state = IB_QPS_INIT; ret = 0; if (create_tun) ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, ctx->port, 0xFFFF, &attr.pkey_index); if (ret || !create_tun) - attr.pkey_index = - to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; attr.qkey = IB_QP1_QKEY; attr.port_num = ctx->port; ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); if (ret) { pr_err("Couldn't change %s qp state to INIT (%d)\n", create_tun ? "tunnel" : "special", ret); goto err_qp; } attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); if (ret) { pr_err("Couldn't change %s qp state to RTR (%d)\n", create_tun ? "tunnel" : "special", ret); goto err_qp; } attr.qp_state = IB_QPS_RTS; attr.sq_psn = 0; ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { pr_err("Couldn't change %s qp state to RTS (%d)\n", create_tun ? "tunnel" : "special", ret); goto err_qp; } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); if (ret) { pr_err(" mlx4_ib_post_pv_buf error" " (err = %d, i = %d)\n", ret, i); goto err_qp; } } return 0; err_qp: ib_destroy_qp(tun_qp->qp); tun_qp->qp = NULL; return ret; } /* * IB MAD completion callback for real SQPs */ static void mlx4_ib_sqp_comp_worker(struct work_struct *work) { struct mlx4_ib_demux_pv_ctx *ctx; struct mlx4_ib_demux_pv_qp *sqp; struct ib_wc wc; struct ib_grh *grh; struct ib_mad *mad; ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; if (wc.status == IB_WC_SUCCESS) { switch (wc.opcode) { case IB_WC_SEND: ib_destroy_ah(sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&sqp->tx_lock); sqp->tx_ix_tail++; spin_unlock(&sqp->tx_lock); break; case IB_WC_RECV: mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) (sqp->ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); grh = &(((struct mlx4_mad_rcv_buf *) (sqp->ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1))) pr_err("Failed reposting SQP " "buf:%lld\n", (unsigned long long)wc.wr_id); break; default: BUG_ON(1); break; } } else { pr_debug("mlx4_ib: completion error in tunnel: %d." " status = %d, wrid = 0x%llx\n", ctx->slave, wc.status, (unsigned long long)wc.wr_id); if (!MLX4_TUN_IS_RECV(wc.wr_id)) { ib_destroy_ah(sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&sqp->tx_lock); sqp->tx_ix_tail++; spin_unlock(&sqp->tx_lock); } } } } static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, struct mlx4_ib_demux_pv_ctx **ret_ctx) { struct mlx4_ib_demux_pv_ctx *ctx; *ret_ctx = NULL; ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); if (!ctx) { pr_err("failed allocating pv resource context " "for port %d, slave %d\n", port, slave); return -ENOMEM; } ctx->ib_dev = &dev->ib_dev; ctx->port = port; ctx->slave = slave; *ret_ctx = ctx; return 0; } static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) { if (dev->sriov.demux[port - 1].tun[slave]) { kfree(dev->sriov.demux[port - 1].tun[slave]); dev->sriov.demux[port - 1].tun[slave] = NULL; } } static int create_pv_resources(struct ib_device *ibdev, int slave, int port, int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) { int ret, cq_size; if (ctx->state != DEMUX_PV_STATE_DOWN) return -EEXIST; ctx->state = DEMUX_PV_STATE_STARTING; /* have QP0 only on port owner, and only if link layer is IB */ if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) && rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) ctx->has_smi = 1; if (ctx->has_smi) { ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); if (ret) { pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); goto err_out; } } ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); if (ret) { pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); goto err_out_qp0; } cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; if (ctx->has_smi) cq_size *= 2; ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, NULL, ctx, cq_size, 0); if (IS_ERR(ctx->cq)) { ret = PTR_ERR(ctx->cq); pr_err("Couldn't create tunnel CQ (%d)\n", ret); goto err_buf; } ctx->pd = ib_alloc_pd(ctx->ib_dev); if (IS_ERR(ctx->pd)) { ret = PTR_ERR(ctx->pd); pr_err("Couldn't create tunnel PD (%d)\n", ret); goto err_cq; } ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(ctx->mr)) { ret = PTR_ERR(ctx->mr); pr_err("Couldn't get tunnel DMA MR (%d)\n", ret); goto err_pd; } if (ctx->has_smi) { ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); if (ret) { pr_err("Couldn't create %s QP0 (%d)\n", create_tun ? "tunnel for" : "", ret); goto err_mr; } } ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); if (ret) { pr_err("Couldn't create %s QP1 (%d)\n", create_tun ? "tunnel for" : "", ret); goto err_qp0; } if (create_tun) INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); else INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); if (ret) { pr_err("Couldn't arm tunnel cq (%d)\n", ret); goto err_wq; } ctx->state = DEMUX_PV_STATE_ACTIVE; return 0; err_wq: ctx->wq = NULL; ib_destroy_qp(ctx->qp[1].qp); ctx->qp[1].qp = NULL; err_qp0: if (ctx->has_smi) ib_destroy_qp(ctx->qp[0].qp); ctx->qp[0].qp = NULL; err_mr: ib_dereg_mr(ctx->mr); ctx->mr = NULL; err_pd: ib_dealloc_pd(ctx->pd); ctx->pd = NULL; err_cq: ib_destroy_cq(ctx->cq); ctx->cq = NULL; err_buf: mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); err_out_qp0: if (ctx->has_smi) mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); err_out: ctx->state = DEMUX_PV_STATE_DOWN; return ret; } static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, struct mlx4_ib_demux_pv_ctx *ctx, int flush) { if (!ctx) return; if (ctx->state > DEMUX_PV_STATE_DOWN) { ctx->state = DEMUX_PV_STATE_DOWNING; if (flush) flush_workqueue(ctx->wq); if (ctx->has_smi) { ib_destroy_qp(ctx->qp[0].qp); ctx->qp[0].qp = NULL; mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); } ib_destroy_qp(ctx->qp[1].qp); ctx->qp[1].qp = NULL; mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); ib_dereg_mr(ctx->mr); ctx->mr = NULL; ib_dealloc_pd(ctx->pd); ctx->pd = NULL; ib_destroy_cq(ctx->cq); ctx->cq = NULL; ctx->state = DEMUX_PV_STATE_DOWN; } } static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, int port, int do_init) { int ret = 0; if (!do_init) { clean_vf_mcast(&dev->sriov.demux[port - 1], slave); /* for master, destroy real sqp resources */ if (slave == mlx4_master_func_num(dev->dev)) destroy_pv_resources(dev, slave, port, dev->sriov.sqps[port - 1], 1); /* destroy the tunnel qp resources */ destroy_pv_resources(dev, slave, port, dev->sriov.demux[port - 1].tun[slave], 1); return 0; } /* create the tunnel qp resources */ ret = create_pv_resources(&dev->ib_dev, slave, port, 1, dev->sriov.demux[port - 1].tun[slave]); /* for master, create the real sqp resources */ if (!ret && slave == mlx4_master_func_num(dev->dev)) ret = create_pv_resources(&dev->ib_dev, slave, port, 0, dev->sriov.sqps[port - 1]); return ret; } void mlx4_ib_tunnels_update_work(struct work_struct *work) { struct mlx4_ib_demux_work *dmxw; dmxw = container_of(work, struct mlx4_ib_demux_work, work); mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, dmxw->do_init); kfree(dmxw); return; } static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, struct mlx4_ib_demux_ctx *ctx, int port) { char name[12]; int ret = 0; int i; ctx->tun = kcalloc(dev->dev->caps.sqp_demux, sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); if (!ctx->tun) return -ENOMEM; ctx->dev = dev; ctx->port = port; ctx->ib_dev = &dev->ib_dev; for (i = 0; i < dev->dev->caps.sqp_demux; i++) { ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); if (ret) { ret = -ENOMEM; goto err_mcg; } } ret = mlx4_ib_mcg_port_init(ctx); if (ret) { pr_err("Failed initializing mcg para-virt (%d)\n", ret); goto err_mcg; } snprintf(name, sizeof name, "mlx4_ibt%d", port); ctx->wq = create_singlethread_workqueue(name); if (!ctx->wq) { pr_err("Failed to create tunnelling WQ for port %d\n", port); ret = -ENOMEM; goto err_wq; } snprintf(name, sizeof name, "mlx4_ibud%d", port); ctx->ud_wq = create_singlethread_workqueue(name); if (!ctx->ud_wq) { pr_err("Failed to create up/down WQ for port %d\n", port); ret = -ENOMEM; goto err_udwq; } return 0; err_udwq: destroy_workqueue(ctx->wq); ctx->wq = NULL; err_wq: mlx4_ib_mcg_port_cleanup(ctx, 1); err_mcg: for (i = 0; i < dev->dev->caps.sqp_demux; i++) free_pv_object(dev, i, port); kfree(ctx->tun); ctx->tun = NULL; return ret; } static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) { if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { sqp_ctx->state = DEMUX_PV_STATE_DOWNING; flush_workqueue(sqp_ctx->wq); if (sqp_ctx->has_smi) { ib_destroy_qp(sqp_ctx->qp[0].qp); sqp_ctx->qp[0].qp = NULL; mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); } ib_destroy_qp(sqp_ctx->qp[1].qp); sqp_ctx->qp[1].qp = NULL; mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); ib_dereg_mr(sqp_ctx->mr); sqp_ctx->mr = NULL; ib_dealloc_pd(sqp_ctx->pd); sqp_ctx->pd = NULL; ib_destroy_cq(sqp_ctx->cq); sqp_ctx->cq = NULL; sqp_ctx->state = DEMUX_PV_STATE_DOWN; } } static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) { int i; if (ctx) { struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); mlx4_ib_mcg_port_cleanup(ctx, 1); for (i = 0; i < dev->dev->caps.sqp_demux; i++) { if (!ctx->tun[i]) continue; if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; } flush_workqueue(ctx->wq); for (i = 0; i < dev->dev->caps.sqp_demux; i++) { destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); free_pv_object(dev, i, ctx->port); } kfree(ctx->tun); destroy_workqueue(ctx->ud_wq); destroy_workqueue(ctx->wq); } } static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) { int i; if (!mlx4_is_master(dev->dev)) return; /* initialize or tear down tunnel QPs for the master */ for (i = 0; i < dev->dev->caps.num_ports; i++) mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); return; } int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) { int i = 0; int err; if (!mlx4_is_mfunc(dev->dev)) return 0; dev->sriov.is_going_down = 0; spin_lock_init(&dev->sriov.going_down_lock); mlx4_ib_cm_paravirt_init(dev); mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n"); if (mlx4_is_slave(dev->dev)) { mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); return 0; } for (i = 0; i < dev->dev->caps.sqp_demux; i++) { if (i == mlx4_master_func_num(dev->dev)) mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid); else mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid()); } err = mlx4_ib_init_alias_guid_service(dev); if (err) { mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); goto paravirt_err; } err = mlx4_ib_device_register_sysfs(dev); if (err) { mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); goto sysfs_err; } mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", dev->dev->caps.sqp_demux); for (i = 0; i < dev->num_ports; i++) { union ib_gid gid; err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1); if (err) goto demux_err; dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id; err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1, &dev->sriov.sqps[i]); if (err) goto demux_err; err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); if (err) goto demux_err; } mlx4_ib_master_tunnels(dev, 1); return 0; demux_err: while (i > 0) { free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); --i; } mlx4_ib_device_unregister_sysfs(dev); sysfs_err: mlx4_ib_destroy_alias_guid_service(dev); paravirt_err: mlx4_ib_cm_paravirt_clean(dev, -1); return err; } void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) { int i; unsigned long flags; if (!mlx4_is_mfunc(dev->dev)) return; spin_lock_irqsave(&dev->sriov.going_down_lock, flags); dev->sriov.is_going_down = 1; spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); if (mlx4_is_master(dev->dev)) { for (i = 0; i < dev->num_ports; i++) { flush_workqueue(dev->sriov.demux[i].ud_wq); mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); kfree(dev->sriov.sqps[i]); dev->sriov.sqps[i] = NULL; mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); } mlx4_ib_cm_paravirt_clean(dev, -1); mlx4_ib_destroy_alias_guid_service(dev); mlx4_ib_device_unregister_sysfs(dev); } } Index: head/sys/ofed/drivers/infiniband/hw/mlx4/main.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 296382) @@ -1,2887 +1,2887 @@ /* * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include "mlx4_exp.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" #define DRV_RELDATE __DATE__ #define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" #define MLX4_IB_MRS_PROC_DIR_NAME "mrs" #define MLX4_IB_FLOW_MAX_PRIO 0xFFF #define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); #ifdef __linux__ MODULE_VERSION(DRV_VERSION); #endif int mlx4_ib_sm_guid_assign = 1; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); enum { MAX_NUM_STR_BITMAP = 1 << 15, DEFAULT_TBL_VAL = -1 }; static struct mlx4_dbdf2val_lst dev_assign_str = { .name = "dev_assign_str param", .num_vals = 1, .def_val = {DEFAULT_TBL_VAL}, .range = {0, MAX_NUM_STR_BITMAP - 1} }; module_param_string(dev_assign_str, dev_assign_str.str, sizeof(dev_assign_str.str), 0444); MODULE_PARM_DESC(dev_assign_str, "Map device function numbers to IB device numbers (e.g. '0000:04:00.0-0,002b:1c:0b.a-1,...').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for IB device numbers (e.g. 1).\n" "\t\tMax supported devices - 32"); static unsigned long *dev_num_str_bitmap; static spinlock_t dev_num_str_lock; static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; struct update_gid_work { struct work_struct work; union ib_gid gids[128]; struct mlx4_ib_dev *dev; int port; }; struct dev_rec { int bus; int dev; int func; int nr; }; static int dr_active; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*, unsigned long); static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev); static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static union ib_gid zgid; static int check_flow_steering_support(struct mlx4_dev *dev) { int eth_num_ports = 0; int ib_num_ports = 0; int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; if (dmfs) { int i; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) eth_num_ports++; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_num_ports++; dmfs &= (!ib_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && (!eth_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); if (ib_num_ports && mlx4_is_mfunc(dev)) { dmfs = 0; } } return dmfs; } int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memset(props, 0, sizeof *props); props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK | IB_DEVICE_SHARED_MR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (check_flow_steering_support(dev->dev)) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; props->device_cap_flags |= IB_DEVICE_QPG; if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { props->device_cap_flags |= IB_DEVICE_UD_RSS; props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; } if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; else props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock; if (dev->dev->caps.hca_core_clock > 0) props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; if (dev->dev->caps.cq_timestamp) { props->timestamp_mask = 0xFFFFFFFFFFFF; props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } out: kfree(in_mad); kfree(out_mad); return err; } static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); if (netw_view) props->gid_tbl_len = out_mad->data[50]; else props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; /* Check if extended speeds (EDR/FDR/...) are supported */ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { ext_active_speed = out_mad->data[62] >> 4; switch (ext_active_speed) { case 1: props->active_speed = IB_SPEED_FDR; break; case 2: props->active_speed = IB_SPEED_EDR; break; } } /* If reported active speed is QDR, check if is FDR-10 */ if (props->active_speed == IB_SPEED_QDR) { init_query_mad(in_mad); in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; /* Checking LinkSpeedActive for FDR-10 */ if (out_mad->data[15] & 0x1) props->active_speed = IB_SPEED_FDR10; } /* Avoid wrong speed value returned by FW if the IB link is down. */ if (props->state == IB_PORT_DOWN) props->active_speed = IB_SPEED_SDR; out: kfree(in_mad); kfree(out_mad); return err; } static u8 state_to_phys_state(enum ib_port_state state) { return state == IB_PORT_ACTIVE ? 5 : 3; } static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); struct mlx4_ib_iboe *iboe = &mdev->iboe; struct net_device *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; unsigned long flags; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP; if (netw_view) props->gid_tbl_len = MLX4_ROCE_MAX_GIDS; else props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; props->max_mtu = IB_MTU_4096; props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock_irqsave(&iboe->lock, flags); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; tmp = iboe_get_mtu(ndev->if_mtu); props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock_irqrestore(&iboe->lock, flags); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? ib_link_query_port(ibdev, port, props, netw_view) : eth_link_query_port(ibdev, port, props, netw_view); return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { /* returns host view */ return __mlx4_ib_query_port(ibdev, port, props, 0); } int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mlx4_ib_dev *dev = to_mdev(ibdev); int clear = 0; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(dev->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); if (mlx4_is_mfunc(dev->dev) && !netw_view) { if (index) { /* For any index > 0, return the null guid */ err = 0; clear = 1; goto out; } } init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: if (clear) memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); *gid = dev->iboe.gid_table[port - 1][index]; return 0; } static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; in_mad->attr_mod = cpu_to_be32(index / 32); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); } static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; if (mlx4_is_slave(to_mdev(ibdev)->dev)) return -EOPNOTSUPP; spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, 64); spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) return 0; memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, 256); if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); } else { ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct ib_port_attr attr; u32 cap_mask; int err; mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) goto out; cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx4_SET_PORT(to_mdev(ibdev), port, !!(mask & IB_PORT_RESET_QKEY_CNTR), cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp_v3.bf_reg_size = 0; resp_v3.bf_regs_per_page = 0; } } else { resp.dev_caps = dev->dev->caps.userspace_caps; resp.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp.bf_reg_size = 0; resp.bf_regs_per_page = 0; } resp.cqe_size = dev->dev->caps.cqe_size; } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); if (err) { kfree(context); return ERR_PTR(err); } INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); return ERR_PTR(-EFAULT); } return &context->ibucontext; } static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); kfree(context); return 0; } /* XXX FBSD has no support for get_unmapped_area function */ #if 0 static unsigned long mlx4_ib_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start_addr; unsigned long page_size_order; unsigned long command; mm = current->mm; if (addr) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); /* Last 8 bits hold the command others are data per that command */ command = pgoff & MLX4_IB_MMAP_CMD_MASK; if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS; /* code is based on the huge-pages get_unmapped_area code */ start_addr = mm->free_area_cache; if (len <= mm->cached_hole_size) start_addr = TASK_UNMAPPED_BASE; full_search: addr = ALIGN(start_addr, 1 << page_size_order); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (TASK_SIZE - len < addr) { /* * Start a new search - just in case we missed * some holes. */ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = TASK_UNMAPPED_BASE; goto full_search; } return -ENOMEM; } if (!vma || addr + len <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, 1 << page_size_order); } } #endif static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); /* Last 8 bits hold the command others are data per that command */ unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* compatability handling for commands 0 & 1*/ if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; } if (command == MLX4_IB_MMAP_UAR_PAGE) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE && dev->dev->caps.bf_reg_size != 0) { vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_GET_HW_CLOCK) { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (ret) return ret; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, (pci_resource_start(dev->dev->pdev, params.bar) + params.offset) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else return -EINVAL; return 0; } static int mlx4_ib_ioctl(struct ib_ucontext *context, unsigned int cmd, unsigned long arg) { struct mlx4_ib_dev *dev = to_mdev(context->device); int ret; int offset; switch (cmd) { case MLX4_IOCHWCLOCKOFFSET: { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (!ret) { offset = params.offset % PAGE_SIZE; ret = put_user(offset, (int *)arg); return sizeof(int); } else { return ret; } } default: { pr_err("mlx4_ib: invalid ioctl %u command with arg %lX\n", cmd, arg); return -ENOTTY; } } return ret; } static int mlx4_ib_query_values(struct ib_device *device, int q_values, struct ib_device_values *values) { struct mlx4_ib_dev *dev = to_mdev(device); cycle_t cycles; values->values_mask = 0; if (q_values & IBV_VALUES_HW_CLOCK) { cycles = mlx4_read_clock(dev->dev); if (cycles < 0) { values->hwclock = cycles & CORE_CLOCK_MASK; values->values_mask |= IBV_VALUES_HW_CLOCK; } q_values &= ~IBV_VALUES_HW_CLOCK; } if (q_values) return -ENOTTY; return 0; } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } return &pd->ibpd; } static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); kfree(pd); return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_xrcd *xrcd; int err; if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); if (err) goto err1; xrcd->pd = ib_alloc_pd(ibdev); if (IS_ERR(xrcd->pd)) { err = PTR_ERR(xrcd->pd); goto err2; } xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); if (IS_ERR(xrcd->cq)) { err = PTR_ERR(xrcd->cq); goto err3; } return &xrcd->ibxrcd; err3: ib_dealloc_pd(xrcd->pd); err2: mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); err1: kfree(xrcd); return ERR_PTR(err); } static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { ib_destroy_cq(to_mxrcd(xrcd)->cq); ib_dealloc_pd(to_mxrcd(xrcd)->pd); mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); kfree(xrcd); return 0; } static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) return -ENOMEM; ge->gid = *gid; if (mlx4_ib_add_mc(mdev, mqp, gid)) { ge->port = mqp->port; ge->added = 1; } mutex_lock(&mqp->mutex); list_add_tail(&ge->list, &mqp->gid_list); mutex_unlock(&mqp->mutex); return 0; } int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { u8 mac[6]; struct net_device *ndev; int ret = 0; if (!mqp->port) return 0; spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0); ret = 1; rtnl_unlock(); dev_put(ndev); } return ret; } struct mlx4_ib_steering { struct list_head list; u64 reg_id; union ib_gid gid; }; static int parse_flow_attr(struct mlx4_dev *dev, union ib_flow_spec *ib_spec, struct _rule_hw *mlx4_spec) { enum mlx4_net_trans_rule_id type; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: type = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, ETH_ALEN); memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, ETH_ALEN); mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; break; case IB_FLOW_SPEC_IB: type = MLX4_NET_TRANS_RULE_ID_IB; mlx4_spec->ib.l3_qpn = ib_spec->ib.val.l3_type_qpn; mlx4_spec->ib.qpn_mask = ib_spec->ib.mask.l3_type_qpn; memcpy(&mlx4_spec->ib.dst_gid, ib_spec->ib.val.dst_gid, 16); memcpy(&mlx4_spec->ib.dst_gid_msk, ib_spec->ib.mask.dst_gid, 16); break; case IB_FLOW_SPEC_IPV4: type = MLX4_NET_TRANS_RULE_ID_IPV4; mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: type = ib_spec->type == IB_FLOW_SPEC_TCP ? MLX4_NET_TRANS_RULE_ID_TCP : MLX4_NET_TRANS_RULE_ID_UDP; mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; break; default: return -EINVAL; } if (map_sw_to_hw_steering_id(dev, type) < 0 || hw_rule_sz(dev, type) < 0) return -EINVAL; mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type)); mlx4_spec->size = hw_rule_sz(dev, type) >> 2; return hw_rule_sz(dev, type); } static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain, enum mlx4_net_trans_promisc_mode flow_type, u64 *reg_id) { int ret, i; int size = 0; void *ib_flow; struct mlx4_ib_dev *mdev = to_mdev(qp->device); struct mlx4_cmd_mailbox *mailbox; struct mlx4_net_trans_rule_hw_ctrl *ctrl; size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + (sizeof(struct _rule_hw) * flow_attr->num_of_specs); static const u16 __mlx4_domain[] = { [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, }; if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { pr_err("Invalid priority value.\n"); return -EINVAL; - } + } if (domain >= IB_FLOW_DOMAIN_NUM) { pr_err("Invalid domain value.\n"); return -EINVAL; } if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) return -EINVAL; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, rule_size); ctrl = mailbox->buf; ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | flow_attr->priority); ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type); ctrl->port = flow_attr->port; ctrl->qpn = cpu_to_be32(qp->qp_num); if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK) ctrl->flags = (1 << 3); ib_flow = flow_attr + 1; size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); for (i = 0; i < flow_attr->num_of_specs; i++) { ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(mdev->dev, mailbox); return -EINVAL; } ib_flow += ((union ib_flow_spec *)ib_flow)->size; size += ret; } ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret == -ENOMEM) pr_err("mcg table is full. Fail to register network rule.\n"); else if (ret == -ENXIO) pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); else if (ret) pr_err("Invalid argumant. Fail to register network rule.\n"); mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; } static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) { int err; err = mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) pr_err("Fail to detach network rule. registration id = 0x%llx\n", (unsigned long long)reg_id); return err; } static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { int err = 0, i = 0; struct mlx4_ib_flow *mflow; enum mlx4_net_trans_promisc_mode type[2]; memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(struct mlx4_ib_flow), GFP_KERNEL); if (!mflow) { err = -ENOMEM; goto err_free; } switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: type[0] = MLX4_FS_REGULAR; - break; + break; case IB_FLOW_ATTR_ALL_DEFAULT: type[0] = MLX4_FS_ALL_DEFAULT; break; case IB_FLOW_ATTR_MC_DEFAULT: type[0] = MLX4_FS_MC_DEFAULT; break; case IB_FLOW_ATTR_SNIFFER: type[0] = MLX4_FS_UC_SNIFFER; type[1] = MLX4_FS_MC_SNIFFER; break; default: err = -EINVAL; goto err_free; } while (i < ARRAY_SIZE(type) && type[i]) { err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], &mflow->reg_id[i]); - if (err) + if (err) goto err_free; i++; } return &mflow->ibflow; err_free: kfree(mflow); return ERR_PTR(err); } static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) { int err, ret = 0; int i = 0; struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); struct mlx4_ib_flow *mflow = to_mflow(flow_id); while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); if (err) ret = err; i++; } kfree(mflow); return ret; } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct mlx4_ib_gid_entry *ge; struct mlx4_ib_gid_entry *tmp; struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { ret = ge; break; } } return ret; } static int del_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_gid_entry *ge; struct net_device *ndev; u8 mac[6]; mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { spin_lock(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); rdma_get_mcast_mac((struct in6_addr *)gid, mac); if (ndev) { rtnl_lock(); dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0); rtnl_unlock(); dev_put(ndev); } list_del(&ge->list); kfree(ge); } else pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return ge != 0 ? 0 : -EINVAL; } static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid, int count) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id = 0; int record_err = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { struct mlx4_ib_steering *ib_steering; struct mlx4_ib_steering *tmp; LIST_HEAD(temp); mutex_lock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &mqp->steering_rules, list) { if (memcmp(ib_steering->gid.raw, gid->raw, 16)) continue; if (--count < 0) break; list_del(&ib_steering->list); list_add(&ib_steering->list, &temp); } mutex_unlock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &temp, list) { reg_id = ib_steering->reg_id; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) { record_err = record_err ?: err; continue; } err = del_gid_entry(ibqp, gid); if (err) { record_err = record_err ?: err; continue; } list_del(&ib_steering->list); kfree(ib_steering); } mutex_lock(&mqp->mutex); list_for_each_entry(ib_steering, &temp, list) { list_add(&ib_steering->list, &mqp->steering_rules); } mutex_unlock(&mqp->mutex); if (count) { pr_warn("Couldn't release all reg_ids for mgid. Steering rule is left attached\n"); return -EINVAL; } } else { if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) return err; err = del_gid_entry(ibqp, gid); if (err) return err; } return record_err; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); int count = (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) ? mdev->dev->caps.num_ports : 1; return _mlx4_ib_mcg_detach(ibqp, gid, lid, count); } static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err = -ENODEV; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); DECLARE_BITMAP(ports, MLX4_MAX_PORTS); int i = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { bitmap_fill(ports, mdev->dev->caps.num_ports); } else { if (mqp->port <= mdev->dev->caps.num_ports) { bitmap_zero(ports, mdev->dev->caps.num_ports); set_bit(0, ports); } else { return -EINVAL; } } for (; i < mdev->dev->caps.num_ports; i++) { u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; if (!test_bit(i, ports)) continue; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); if (!ib_steering) goto err_add; } err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, i + 1, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, ®_id); if (err) { kfree(ib_steering); goto err_add; } err = add_gid_entry(ibqp, gid); if (err) { mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); kfree(ib_steering); goto err_add; } if (ib_steering) { memcpy(ib_steering->gid.raw, gid->raw, 16); mutex_lock(&mqp->mutex); list_add(&ib_steering->list, &mqp->steering_rules); mutex_unlock(&mqp->mutex); ib_steering->reg_id = reg_id; } } return 0; err_add: if (i > 0) _mlx4_ib_mcg_detach(ibqp, gid, lid, i); return err; } static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; if (mlx4_is_master(dev->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(dev->ib_dev.node_desc, out_mad->data, 64); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } static ssize_t show_vsd(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ssize_t len = MLX4_VSD_LEN; if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX) len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd); else memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN); return len; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_vsd }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port) { memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; } else { eui[3] = 0xff; eui[4] = 0xfe; } eui[0] ^= 2; } static void update_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof gw->gids); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == IB_LINK_LAYER_ETHERNET) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, - 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, - MLX4_CMD_WRAPPED); + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); - if (err) - pr_warn("set port command failed\n"); + if (err) + pr_warn("set port command failed\n"); else mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static void reset_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("reset gid table failed\n"); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof(gw->gids)); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 0) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 1, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 1 command failed\n"); } if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 2) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 1) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 2, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 2 command failed\n"); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static int update_gid_table(struct mlx4_ib_dev *dev, int port, union ib_gid *gid, int clear, int default_gid) { struct update_gid_work *work; int i; int need_update = 0; int free = -1; int found = -1; int max_gids; int start_index = !default_gid; max_gids = dev->dev->caps.gid_table_len[port]; for (i = start_index; i < max_gids; ++i) { if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, sizeof(*gid))) found = i; if (clear) { if (found >= 0) { need_update = 1; dev->iboe.gid_table[port - 1][found] = zgid; - break; - } + break; + } } else { if (found >= 0) break; if (free < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof(*gid))) free = i; - } - } + } + } if (found == -1 && !clear && free < 0) { pr_err("GID table of port %d is full. Can't add "GID_PRINT_FMT"\n", port, GID_PRINT_ARGS(gid)); return -ENOMEM; - } + } if (found == -1 && clear) { pr_err(GID_PRINT_FMT" is not in GID table of port %d\n", GID_PRINT_ARGS(gid), port); return -EINVAL; - } + } if (found == -1 && !clear && free >= 0) { dev->iboe.gid_table[port - 1][free] = *gid; need_update = 1; - } + } if (!need_update) return 0; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); - INIT_WORK(&work->work, update_gids_task); - work->port = port; - work->dev = dev; - queue_work(wq, &work->work); + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); return 0; } static int reset_gid_table(struct mlx4_ib_dev *dev) { struct update_gid_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return -ENOMEM; memset(dev->iboe.gid_table, 0, sizeof(dev->iboe.gid_table)); memset(work->gids, 0, sizeof(work->gids)); INIT_WORK(&work->work, reset_gids_task); work->dev = dev; queue_work(wq, &work->work); return 0; } /* XXX BOND Related - stub (no support for these flags in FBSD)*/ static inline int netif_is_bond_master(struct net_device *dev) { #if 0 return (dev->flags & IFF_MASTER) && (dev->priv_flags & IFF_BONDING); #endif return 0; } static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid, u8 port) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev, port); } static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev) { u8 port = 0; struct mlx4_ib_iboe *iboe; struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? rdma_vlan_dev_real_dev(dev) : dev; iboe = &ibdev->iboe; for (port = 1; port <= MLX4_MAX_PORTS; ++port) if ((netif_is_bond_master(real_dev) && (real_dev == iboe->masters[port - 1])) || (!netif_is_bond_master(real_dev) && (real_dev == iboe->netdevs[port - 1]))) - break; + break; return port > MLX4_MAX_PORTS ? 0 : port; } static void mlx4_ib_get_dev_addr(struct net_device *dev, struct mlx4_ib_dev *ibdev, u8 port) { struct ifaddr *ifa; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct inet6_dev *in6_dev; union ib_gid *pgid; struct inet6_ifaddr *ifp; #endif union ib_gid gid; if ((port == 0) || (port > MLX4_MAX_PORTS)) return; /* IPv4 gids */ TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET){ ipv6_addr_set_v4mapped( ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr, (struct in6_addr *)&gid); update_gid_table(ibdev, port, &gid, 0, 0); } } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* IPv6 gids */ in6_dev = in6_dev_get(dev); if (in6_dev) { read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { pgid = (union ib_gid *)&ifp->addr; - update_gid_table(ibdev, port, pgid, 0, 0); - } + update_gid_table(ibdev, port, pgid, 0, 0); + } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); - } + } #endif } static void mlx4_set_default_gid(struct mlx4_ib_dev *ibdev, struct net_device *dev, u8 port) { union ib_gid gid; mlx4_make_default_gid(dev, &gid, port); update_gid_table(ibdev, port, &gid, 0, 1); } static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) { struct net_device *dev; if (reset_gid_table(ibdev)) return -1; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(dev, &V_ifnet, if_link) { u8 port = mlx4_ib_get_dev_port(dev, ibdev); if (port) { if (!rdma_vlan_dev_real_dev(dev) && !netif_is_bond_master(dev)) mlx4_set_default_gid(ibdev, dev, port); mlx4_ib_get_dev_addr(dev, ibdev, port); } } IFNET_RUNLOCK_NOSLEEP(); return 0; } static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device *dev, unsigned long event) { struct mlx4_ib_iboe *iboe; int port; int init = 0; unsigned long flags; iboe = &ibdev->iboe; spin_lock_irqsave(&iboe->lock, flags); mlx4_foreach_ib_transport_port(port, ibdev->dev) { struct net_device *old_netdev = iboe->netdevs[port - 1]; /* XXX BOND related */ #if 0 struct net_device *old_master = iboe->masters[port - 1]; #endif iboe->masters[port - 1] = NULL; iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (old_netdev != iboe->netdevs[port - 1]) init = 1; if (dev == iboe->netdevs[port - 1] && event == NETDEV_CHANGEADDR) init = 1; /* XXX BOND related */ #if 0 if (iboe->netdevs[port - 1] && netif_is_bond_slave(iboe->netdevs[port - 1])) iboe->masters[port - 1] = iboe->netdevs[port - 1]->master; /* if bonding is used it is possible that we add it to masters only after IP address is assigned to the net bonding interface */ if (old_master != iboe->masters[port - 1]) init = 1; #endif } spin_unlock_irqrestore(&iboe->lock, flags); if (init) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Fail to reset gid table\n"); } static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); mlx4_ib_scan_netdevs(ibdev, dev, event); return NOTIFY_DONE; } /* This function initializes the gid table only if the event_netdev real device is an iboe * device, will be invoked by the inet/inet6 events */ static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_netdev = ptr; struct mlx4_ib_dev *ibdev; struct mlx4_ib_iboe *ibdev_iboe; int port = 0; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? rdma_vlan_dev_real_dev(event_netdev) : event_netdev; ibdev_iboe = &ibdev->iboe; port = mlx4_ib_get_dev_port(real_dev, ibdev); /* Perform init_gid_table if the event real_dev is the net_device which represents this port, * otherwise this event is not related and would be ignored.*/ if(port && (real_dev == ibdev_iboe->netdevs[port - 1])) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Fail to reset gid table\n"); return NOTIFY_DONE; } static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; int slave; int i; if (mlx4_is_master(ibdev->dev)) { for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) { ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = /* master has the identity virt2phys pkey mapping */ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; mlx4_sync_pkey_table(ibdev->dev, slave, port, i, ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); } } } /* initialize pkey cache */ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; } } } static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { char name[32]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; int i, j, eq; /* Legacy mode or comp_pool is not large enough */ if (dev->caps.comp_pool == 0 || dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ dev->caps.num_ports); /* Init eq table */ added_eqs = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) added_eqs += eq_per_port; total_eqs = dev->caps.num_comp_vectors + added_eqs; ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); if (!ibdev->eq_table) return; ibdev->eq_added = added_eqs; eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, - pci_get_domain(dev->pdev->dev.bsddev), - pci_get_bus(dev->pdev->dev.bsddev), - PCI_SLOT(dev->pdev->devfn), - PCI_FUNC(dev->pdev->devfn)); + pci_get_domain(dev->pdev->dev.bsddev), + pci_get_bus(dev->pdev->dev.bsddev), + PCI_SLOT(dev->pdev->devfn), + PCI_FUNC(dev->pdev->devfn)); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = (eq % dev->caps.num_comp_vectors); } eq++; } } /* Fill the reset of the vector with legacy EQ */ for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) ibdev->eq_table[eq++] = i; /* Advertise the new number of EQs to clients */ ibdev->ib_dev.num_comp_vectors = total_eqs; } static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; /* no additional eqs were added */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; /* Free only the added eqs */ for (i = 0; i < ibdev->eq_added; i++) { /* Don't free legacy eqs if used */ if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) continue; mlx4_release_eq(dev, ibdev->eq_table[i]); } kfree(ibdev->eq_table); } /* * create show function and a device_attribute struct pointing to * the function for _name */ #define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ static ssize_t show_rprt_##_name(struct device *dev, \ struct device_attribute *attr, \ char *buf){ \ return show_diag_rprt(dev, buf, _offset, _op_mod); \ } \ static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL); #define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 static size_t show_diag_rprt(struct device *device, char *buf, u32 offset, u8 op_modifier) { size_t ret; u32 counter_offset = offset; u32 diag_counter = 0; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, &counter_offset, &diag_counter); if (ret) return ret; return sprintf(buf, "%d\n", diag_counter); } static ssize_t clear_diag_counters(struct device *device, struct device_attribute *attr, const char *buf, size_t length) { size_t ret; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, NULL, NULL); if (ret) return ret; return length; } DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); static struct attribute *diag_rprt_attrs[] = { &dev_attr_rq_num_lle.attr, &dev_attr_sq_num_lle.attr, &dev_attr_rq_num_lqpoe.attr, &dev_attr_sq_num_lqpoe.attr, &dev_attr_rq_num_lpe.attr, &dev_attr_sq_num_lpe.attr, &dev_attr_rq_num_wrfe.attr, &dev_attr_sq_num_wrfe.attr, &dev_attr_sq_num_mwbe.attr, &dev_attr_sq_num_bre.attr, &dev_attr_rq_num_lae.attr, &dev_attr_sq_num_rire.attr, &dev_attr_rq_num_rire.attr, &dev_attr_sq_num_rae.attr, &dev_attr_rq_num_rae.attr, &dev_attr_sq_num_roe.attr, &dev_attr_sq_num_tree.attr, &dev_attr_sq_num_rree.attr, &dev_attr_rq_num_rnr.attr, &dev_attr_sq_num_rnr.attr, &dev_attr_rq_num_oos.attr, &dev_attr_sq_num_oos.attr, &dev_attr_rq_num_mce.attr, &dev_attr_rq_num_udsdprd.attr, &dev_attr_rq_num_ucsdprd.attr, &dev_attr_num_cqovf.attr, &dev_attr_num_eqovf.attr, &dev_attr_num_baddb.attr, &dev_attr_clear_diag.attr, NULL }; static struct attribute_group diag_counters_group = { .name = "diag_counters", .attrs = diag_rprt_attrs }; static void init_dev_assign(void) { int i = 1; - + spin_lock_init(&dev_num_str_lock); if (mlx4_fill_dbdf2val_tbl(&dev_assign_str)) return; dev_num_str_bitmap = kmalloc(BITS_TO_LONGS(MAX_NUM_STR_BITMAP) * sizeof(long), GFP_KERNEL); if (!dev_num_str_bitmap) { pr_warn("bitmap alloc failed -- cannot apply dev_assign_str parameter\n"); return; } bitmap_zero(dev_num_str_bitmap, MAX_NUM_STR_BITMAP); while ((i < MLX4_DEVS_TBL_SIZE) && (dev_assign_str.tbl[i].dbdf != MLX4_ENDOF_TBL)) { if (bitmap_allocate_region(dev_num_str_bitmap, dev_assign_str.tbl[i].val[0], 0)) goto err; i++; } dr_active = 1; return; err: kfree(dev_num_str_bitmap); dev_num_str_bitmap = NULL; pr_warn("mlx4_ib: The value of 'dev_assign_str' parameter " "is incorrect. The parameter value is discarded!"); } static int mlx4_ib_dev_idx(struct mlx4_dev *dev) { int i, val; if (!dr_active) return -1; if (!dev) return -1; if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val)) return -1; if (val != DEFAULT_TBL_VAL) { dev->flags |= MLX4_FLAG_DEV_NUM_STR; return val; } spin_lock(&dev_num_str_lock); i = bitmap_find_free_region(dev_num_str_bitmap, MAX_NUM_STR_BITMAP, 0); spin_unlock(&dev_num_str_lock); if (i >= 0) return i; return -1; } static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports = 0; int i, j; int err; struct mlx4_ib_iboe *iboe; int dev_idx; pr_info_once("%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... */ if (num_ports == 0) return NULL; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); return NULL; } iboe = &ibdev->iboe; if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->priv_uar.map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; dev_idx = mlx4_ib_dev_idx(dev); if (dev_idx >= 0) sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx); else - strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; else ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; ibdev->ib_dev.modify_port = mlx4_ib_modify_port; ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; /* XXX FBSD has no support for get_unmapped_area function */ #if 0 ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; #endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; ibdev->ib_dev.query_ah = mlx4_ib_query_ah; ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; ibdev->ib_dev.create_srq = mlx4_ib_create_srq; ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; ibdev->ib_dev.query_srq = mlx4_ib_query_srq; ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; ibdev->ib_dev.create_qp = mlx4_ib_create_qp; ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; ibdev->ib_dev.ioctl = mlx4_ib_ioctl; ibdev->ib_dev.query_values = mlx4_ib_query_values; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) { ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } /* * Set experimental data */ ibdev->ib_dev.uverbs_exp_cmd_mask = (1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ) | (1ull << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ); ibdev->ib_dev.exp_create_qp = mlx4_ib_exp_create_qp; ibdev->ib_dev.exp_query_device = mlx4_ib_exp_query_device; if (check_flow_steering_support(dev)) { ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; } else { pr_debug("Device managed flow steering is unavailable for this configuration.\n"); } /* * End of experimental data */ mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { if (mlx4_is_slave(dev)) { ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i].counter_index); } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */ ibdev->counters[i].counter_index = ((i + 1) << 1) - 1; ibdev->counters[i].status = 0; } dev_info(&dev->pdev->dev, "%s: allocated counter index %d for port %d\n", __func__, ibdev->counters[i].counter_index, i+1); } else { ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX; ibdev->counters[i].status = -ENOSPC; } } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && !mlx4_is_mfunc(dev)) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); if (err) goto err_counter; ibdev->ib_uc_qpns_bitmap = kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { dev_err(&dev->pdev->dev, "bit map alloc failed\n"); goto err_steer_qp_release; } bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1); if (err) goto err_steer_free_bitmap; } if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; if (mlx4_ib_init_sriov(ibdev)) goto err_mad; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { if (!iboe->nb.notifier_call) { - iboe->nb.notifier_call = mlx4_ib_netdev_event; - err = register_netdevice_notifier(&iboe->nb); + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); if (err) { iboe->nb.notifier_call = NULL; goto err_notify; } } if (!iboe->nb_inet.notifier_call) { iboe->nb_inet.notifier_call = mlx4_ib_inet_event; err = register_inetaddr_notifier(&iboe->nb_inet); if (err) { iboe->nb_inet.notifier_call = NULL; goto err_notify; } } mlx4_ib_scan_netdevs(ibdev, NULL, 0); } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) goto err_notify; } if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) goto err_notify; ibdev->ib_active = true; if (mlx4_is_mfunc(ibdev->dev)) init_pkeys(ibdev); /* create paravirt contexts for any VFs which are active */ if (mlx4_is_master(ibdev->dev)) { for (j = 0; j < MLX4_MFUNC_MAX; j++) { if (j == mlx4_master_func_num(ibdev->dev)) continue; if (mlx4_is_slave_active(ibdev->dev, j)) do_slave_init(ibdev, j, 1); } } return ibdev; err_notify: for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } if (ibdev->iboe.nb.notifier_call) { - if (unregister_netdevice_notifier(&ibdev->iboe.nb)) - pr_warn("failure unregistering notifier\n"); + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } flush_workqueue(wq); mlx4_ib_close_sriov(ibdev); err_mad: mlx4_ib_mad_cleanup(ibdev); err_reg: ib_unregister_device(&ibdev->ib_dev); err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); err_counter: for (; i; --i) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1].counter_index); } } err_map: iounmap(ibdev->priv_uar.map); mlx4_ib_free_eqs(dev, ibdev); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); err_pd: mlx4_pd_free(dev, ibdev->priv_pdn); err_dealloc: ib_dealloc_device(&ibdev->ib_dev); return NULL; } int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) { int offset; WARN_ON(!dev->ib_uc_qpns_bitmap); offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, dev->steer_qpn_count, get_count_order(count)); if (offset < 0) return offset; *qpn = dev->steer_qpn_base + offset; return 0; } void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) { if (!qpn || dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return; BUG_ON(qpn < dev->steer_qpn_base); bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, get_count_order(count)); } int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { int err; size_t flow_size; struct ib_flow_attr *flow = NULL; struct ib_flow_spec_ib *ib_spec; if (is_attach) { flow_size = sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_ib); flow = kzalloc(flow_size, GFP_KERNEL); if (!flow) return -ENOMEM; flow->port = mqp->port; flow->num_of_specs = 1; flow->size = flow_size; ib_spec = (struct ib_flow_spec_ib *)(flow + 1); ib_spec->type = IB_FLOW_SPEC_IB; ib_spec->size = sizeof(struct ib_flow_spec_ib); ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num; ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK; err = __mlx4_ib_create_flow(&mqp->ibqp, flow, IB_FLOW_DOMAIN_NIC, MLX4_FS_REGULAR, &mqp->reg_id); } else { err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); } kfree(flow); return err; } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p, j; int dev_idx, ret; if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); mlx4_ib_mad_cleanup(ibdev); for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } dev_idx = -1; if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) { ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx); if (ret != 1) dev_idx = -1; } ib_unregister_device(&ibdev->ib_dev); if (dev_idx >= 0) { spin_lock(&dev_num_str_lock); bitmap_release_region(dev_num_str_bitmap, dev_idx, 0); spin_unlock(&dev_num_str_lock); } if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); kfree(ibdev->ib_uc_qpns_bitmap); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); for (p = 0; p < ibdev->num_ports; ++p) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p].counter_index); } } mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); mlx4_ib_free_eqs(dev, ibdev); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) { struct mlx4_ib_demux_work **dm = NULL; struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; if (!mlx4_is_master(dev)) return; dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } for (i = 0; i < dev->caps.num_ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); for (i = 0; i < dev->caps.num_ports; i++) { if (dm[i]) kfree(dm[i]); } goto out; } } /* initialize or tear down tunnel QPs for the slave */ for (i = 0; i < dev->caps.num_ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: if (dm) kfree(dm); return; } static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); struct mlx4_eqe *eqe = NULL; struct ib_event_work *ew; int p = 0; if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: if (p > ibdev->num_ports) return; if (mlx4_is_master(dev) && rdma_port_get_link_layer(&ibdev->ib_dev, p) == IB_LINK_LAYER_INFINIBAND) { mlx4_ib_invalidate_all_guid_record(ibdev, p); } mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is up\n", p); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: if (p > ibdev->num_ports) return; mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is down\n", p); ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) { pr_err("failed to allocate memory for events work\n"); break; } INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); ew->ib_dev = ibdev; /* need to queue only for port owner, which uses GEN_EQE */ if (mlx4_is_master(dev)) queue_work(wq, &ew->work); else handle_port_mgmt_change_event(&ew->work); return; case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; default: return; } ibev.device = ibdev_ptr; ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) { int err; wq = create_singlethread_workqueue("mlx4_ib"); if (!wq) return -ENOMEM; err = mlx4_ib_mcg_init(); if (err) goto clean_proc; init_dev_assign(); err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; return 0; clean_mcg: mlx4_ib_mcg_destroy(); clean_proc: destroy_workqueue(wq); return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); kfree(dev_num_str_bitmap); } module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE); module_exit(mlx4_ib_cleanup); static int mlx4ib_evhand(module_t mod, int event, void *arg) { - return (0); + return (0); } static moduledata_t mlx4ib_mod = { - .name = "mlx4ib", - .evhand = mlx4ib_evhand, + .name = "mlx4ib", + .evhand = mlx4ib_evhand, }; DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); MODULE_DEPEND(mlx4ib, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c (revision 296382) @@ -1,1260 +1,1260 @@ /* * Copyright (c) 2012 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include "mlx4_ib.h" #define MAX_VFS 80 #define MAX_PEND_REQS_PER_FUNC 4 #define MAD_TIMEOUT_MS 2000 #define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) #define mcg_error(fmt, arg...) pr_err(fmt, ##arg) #define mcg_warn_group(group, format, arg...) \ pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ (group)->name, group->demux->port, ## arg) #define mcg_error_group(group, format, arg...) \ pr_err(" %16s: " format, (group)->name, ## arg) static union ib_gid mgid0; static struct workqueue_struct *clean_wq; enum mcast_state { MCAST_NOT_MEMBER = 0, MCAST_MEMBER, }; enum mcast_group_state { MCAST_IDLE, MCAST_JOIN_SENT, MCAST_LEAVE_SENT, MCAST_RESP_READY }; struct mcast_member { enum mcast_state state; uint8_t join_state; int num_pend_reqs; struct list_head pending; }; struct ib_sa_mcmember_data { union ib_gid mgid; union ib_gid port_gid; __be32 qkey; __be16 mlid; u8 mtusel_mtu; u8 tclass; __be16 pkey; u8 ratesel_rate; u8 lifetmsel_lifetm; __be32 sl_flowlabel_hoplimit; u8 scope_join_state; u8 proxy_join; u8 reserved[2]; }; struct mcast_group { struct ib_sa_mcmember_data rec; struct rb_node node; struct list_head mgid0_list; struct mlx4_ib_demux_ctx *demux; struct mcast_member func[MAX_VFS]; struct mutex lock; struct work_struct work; struct list_head pending_list; int members[3]; enum mcast_group_state state; enum mcast_group_state prev_state; struct ib_sa_mad response_sa_mad; __be64 last_req_tid; char name[33]; /* MGID string */ struct device_attribute dentry; /* refcount is the reference count for the following: 1. Each queued request 2. Each invocation of the worker thread 3. Membership of the port at the SA */ atomic_t refcount; /* delayed work to clean pending SM request */ struct delayed_work timeout_work; struct list_head cleanup_list; }; struct mcast_req { int func; struct ib_sa_mad sa_mad; struct list_head group_list; struct list_head func_list; struct mcast_group *group; int clean; }; #define safe_atomic_dec(ref) \ - do {\ - if (atomic_dec_and_test(ref)) \ + do {\ + if (atomic_dec_and_test(ref)) \ mcg_warn_group(group, "did not expect to reach zero\n"); \ } while (0) static const char *get_state_string(enum mcast_group_state state) { switch (state) { case MCAST_IDLE: return "MCAST_IDLE"; case MCAST_JOIN_SENT: return "MCAST_JOIN_SENT"; case MCAST_LEAVE_SENT: return "MCAST_LEAVE_SENT"; case MCAST_RESP_READY: return "MCAST_RESP_READY"; } return "Invalid State"; } static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, union ib_gid *mgid) { struct rb_node *node = ctx->mcg_table.rb_node; struct mcast_group *group; int ret; while (node) { group = rb_entry(node, struct mcast_group, node); ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); if (!ret) return group; if (ret < 0) node = node->rb_left; else node = node->rb_right; } return NULL; } static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, struct mcast_group *group) { struct rb_node **link = &ctx->mcg_table.rb_node; struct rb_node *parent = NULL; struct mcast_group *cur_group; int ret; while (*link) { parent = *link; cur_group = rb_entry(parent, struct mcast_group, node); ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, sizeof group->rec.mgid); if (ret < 0) link = &(*link)->rb_left; else if (ret > 0) link = &(*link)->rb_right; else return cur_group; } rb_link_node(&group->node, parent, link); rb_insert_color(&group->node, &ctx->mcg_table); return NULL; } static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) { struct mlx4_ib_dev *dev = ctx->dev; struct ib_ah_attr ah_attr; spin_lock(&dev->sm_lock); if (!dev->sm_ah[ctx->port - 1]) { /* port is not yet Active, sm_ah not ready */ spin_unlock(&dev->sm_lock); return -EAGAIN; } mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); spin_unlock(&dev->sm_lock); return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, 0, mad); } static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) { struct mlx4_ib_dev *dev = ctx->dev; struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; struct ib_wc wc; struct ib_ah_attr ah_attr; /* Our agent might not yet be registered when mads start to arrive */ if (!agent) return -EAGAIN; ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) return -EINVAL; wc.sl = 0; wc.dlid_path_bits = 0; wc.port_num = ctx->port; wc.slid = ah_attr.dlid; /* opensm lid */ wc.src_qp = 1; return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); } static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) { struct ib_sa_mad mad; struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; int ret; /* we rely on a mad request as arrived from a VF */ memcpy(&mad, sa_mad, sizeof mad); /* fix port GID to be the real one (slave 0) */ sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; /* assign our own TID */ mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); /* set timeout handler */ if (!ret) { /* calls mlx4_ib_mcg_timeout_handler */ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, msecs_to_jiffies(MAD_TIMEOUT_MS)); } return ret; } static int send_leave_to_wire(struct mcast_group *group, u8 join_state) { struct ib_sa_mad mad; struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; int ret; memset(&mad, 0, sizeof mad); mad.mad_hdr.base_version = 1; mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; mad.mad_hdr.class_version = 2; mad.mad_hdr.method = IB_SA_METHOD_DELETE; mad.mad_hdr.status = cpu_to_be16(0); mad.mad_hdr.class_specific = cpu_to_be16(0); mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); mad.mad_hdr.attr_mod = cpu_to_be32(0); mad.sa_hdr.sm_key = 0x0; mad.sa_hdr.attr_offset = cpu_to_be16(7); mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; *sa_data = group->rec; sa_data->scope_join_state = join_state; ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); if (ret) group->state = MCAST_IDLE; /* set timeout handler */ if (!ret) { /* calls mlx4_ib_mcg_timeout_handler */ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, msecs_to_jiffies(MAD_TIMEOUT_MS)); } return ret; } static int send_reply_to_slave(int slave, struct mcast_group *group, struct ib_sa_mad *req_sa_mad, u16 status) { struct ib_sa_mad mad; struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; int ret; memset(&mad, 0, sizeof mad); mad.mad_hdr.base_version = 1; mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; mad.mad_hdr.class_version = 2; mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; mad.mad_hdr.status = cpu_to_be16(status); mad.mad_hdr.class_specific = cpu_to_be16(0); mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); mad.mad_hdr.attr_mod = cpu_to_be32(0); mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; mad.sa_hdr.attr_offset = cpu_to_be16(7); mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ *sa_data = group->rec; /* reconstruct VF's requested join_state and port_gid */ sa_data->scope_join_state &= 0xf0; sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); return ret; } static int check_selector(ib_sa_comp_mask comp_mask, ib_sa_comp_mask selector_mask, ib_sa_comp_mask value_mask, u8 src_value, u8 dst_value) { int err; u8 selector = dst_value >> 6; dst_value &= 0x3f; src_value &= 0x3f; if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) return 0; switch (selector) { case IB_SA_GT: err = (src_value <= dst_value); break; case IB_SA_LT: err = (src_value >= dst_value); break; case IB_SA_EQ: err = (src_value != dst_value); break; default: err = 0; break; } return err; } static u16 cmp_rec(struct ib_sa_mcmember_data *src, struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) { /* src is group record, dst is request record */ /* MGID must already match */ /* Port_GID we always replace to our Port_GID, so it is a match */ #define MAD_STATUS_REQ_INVALID 0x0200 if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) return MAD_STATUS_REQ_INVALID; if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return MAD_STATUS_REQ_INVALID; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, IB_SA_MCMEMBER_REC_MTU, src->mtusel_mtu, dst->mtusel_mtu)) return MAD_STATUS_REQ_INVALID; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->tclass != dst->tclass) return MAD_STATUS_REQ_INVALID; if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return MAD_STATUS_REQ_INVALID; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, IB_SA_MCMEMBER_REC_RATE, src->ratesel_rate, dst->ratesel_rate)) return MAD_STATUS_REQ_INVALID; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) return MAD_STATUS_REQ_INVALID; if (comp_mask & IB_SA_MCMEMBER_REC_SL && (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) return MAD_STATUS_REQ_INVALID; if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) return MAD_STATUS_REQ_INVALID; if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) return MAD_STATUS_REQ_INVALID; if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && (src->scope_join_state & 0xf0) != (dst->scope_join_state & 0xf0)) return MAD_STATUS_REQ_INVALID; /* join_state checked separately, proxy_join ignored */ return 0; } /* release group, return 1 if this was last release and group is destroyed * timout work is canceled sync */ static int release_group(struct mcast_group *group, int from_timeout_handler) { struct mlx4_ib_demux_ctx *ctx = group->demux; int nzgroup; mutex_lock(&ctx->mcg_table_lock); mutex_lock(&group->lock); if (atomic_dec_and_test(&group->refcount)) { if (!from_timeout_handler) { if (group->state != MCAST_IDLE && !cancel_delayed_work(&group->timeout_work)) { atomic_inc(&group->refcount); mutex_unlock(&group->lock); mutex_unlock(&ctx->mcg_table_lock); return 0; } } nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); if (nzgroup) del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); if (!list_empty(&group->pending_list)) mcg_warn_group(group, "releasing a group with non empty pending list\n"); if (nzgroup) rb_erase(&group->node, &ctx->mcg_table); list_del_init(&group->mgid0_list); mutex_unlock(&group->lock); mutex_unlock(&ctx->mcg_table_lock); kfree(group); return 1; } else { mutex_unlock(&group->lock); mutex_unlock(&ctx->mcg_table_lock); } return 0; } static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; for (i = 0; i < 3; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } static u8 get_leave_state(struct mcast_group *group) { u8 leave_state = 0; int i; for (i = 0; i < 3; i++) if (!group->members[i]) leave_state |= (1 << i); return leave_state & (group->rec.scope_join_state & 7); } static int join_group(struct mcast_group *group, int slave, u8 join_mask) { int ret = 0; u8 join_state; /* remove bits that slave is already member of, and adjust */ join_state = join_mask & (~group->func[slave].join_state); adjust_membership(group, join_state, 1); group->func[slave].join_state |= join_state; if (group->func[slave].state != MCAST_MEMBER && join_state) { group->func[slave].state = MCAST_MEMBER; ret = 1; } return ret; } static int leave_group(struct mcast_group *group, int slave, u8 leave_state) { int ret = 0; adjust_membership(group, leave_state, -1); group->func[slave].join_state &= ~leave_state; if (!group->func[slave].join_state) { group->func[slave].state = MCAST_NOT_MEMBER; ret = 1; } return ret; } static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) { if (group->func[slave].state != MCAST_MEMBER) return MAD_STATUS_REQ_INVALID; /* make sure we're not deleting unset bits */ if (~group->func[slave].join_state & leave_mask) return MAD_STATUS_REQ_INVALID; if (!leave_mask) return MAD_STATUS_REQ_INVALID; return 0; } static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) { struct delayed_work *delay = to_delayed_work(work); struct mcast_group *group; struct mcast_req *req = NULL; group = container_of(delay, typeof(*group), timeout_work); mutex_lock(&group->lock); if (group->state == MCAST_JOIN_SENT) { if (!list_empty(&group->pending_list)) { req = list_first_entry(&group->pending_list, struct mcast_req, group_list); list_del(&req->group_list); list_del(&req->func_list); --group->func[req->func].num_pend_reqs; mutex_unlock(&group->lock); kfree(req); if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { if (release_group(group, 1)) return; } else { kfree(group); return; } mutex_lock(&group->lock); } else mcg_warn_group(group, "DRIVER BUG\n"); } else if (group->state == MCAST_LEAVE_SENT) { if (group->rec.scope_join_state & 7) group->rec.scope_join_state &= 0xf8; group->state = MCAST_IDLE; mutex_unlock(&group->lock); if (release_group(group, 1)) return; mutex_lock(&group->lock); } else mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); group->state = MCAST_IDLE; atomic_inc(&group->refcount); if (!queue_work(group->demux->mcg_wq, &group->work)) - safe_atomic_dec(&group->refcount); + safe_atomic_dec(&group->refcount); mutex_unlock(&group->lock); } static int handle_leave_req(struct mcast_group *group, u8 leave_mask, struct mcast_req *req) { u16 status; if (req->clean) leave_mask = group->func[req->func].join_state; status = check_leave(group, req->func, leave_mask); if (!status) leave_group(group, req->func, leave_mask); if (!req->clean) send_reply_to_slave(req->func, group, &req->sa_mad, status); --group->func[req->func].num_pend_reqs; list_del(&req->group_list); list_del(&req->func_list); kfree(req); return 1; } static int handle_join_req(struct mcast_group *group, u8 join_mask, struct mcast_req *req) { u8 group_join_state = group->rec.scope_join_state & 7; int ref = 0; u16 status; struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; if (join_mask == (group_join_state & join_mask)) { /* port's membership need not change */ status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); if (!status) join_group(group, req->func, join_mask); --group->func[req->func].num_pend_reqs; send_reply_to_slave(req->func, group, &req->sa_mad, status); list_del(&req->group_list); list_del(&req->func_list); kfree(req); ++ref; } else { /* port's membership needs to be updated */ group->prev_state = group->state; if (send_join_to_wire(group, &req->sa_mad)) { --group->func[req->func].num_pend_reqs; list_del(&req->group_list); list_del(&req->func_list); kfree(req); ref = 1; group->state = group->prev_state; } else group->state = MCAST_JOIN_SENT; } return ref; } static void mlx4_ib_mcg_work_handler(struct work_struct *work) { struct mcast_group *group; struct mcast_req *req = NULL; struct ib_sa_mcmember_data *sa_data; u8 req_join_state; int rc = 1; /* release_count - this is for the scheduled work */ u16 status; u8 method; group = container_of(work, typeof(*group), work); mutex_lock(&group->lock); /* First, let's see if a response from SM is waiting regarding this group. * If so, we need to update the group's REC. If this is a bad response, we * may need to send a bad response to a VF waiting for it. If VF is waiting * and this is a good response, the VF will be answered later in this func. */ if (group->state == MCAST_RESP_READY) { /* cancels mlx4_ib_mcg_timeout_handler */ cancel_delayed_work(&group->timeout_work); status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); method = group->response_sa_mad.mad_hdr.method; if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n", (long long)be64_to_cpu( group->response_sa_mad.mad_hdr.tid), (long long)be64_to_cpu(group->last_req_tid)); group->state = group->prev_state; goto process_requests; } if (status) { if (!list_empty(&group->pending_list)) req = list_first_entry(&group->pending_list, struct mcast_req, group_list); if (method == IB_MGMT_METHOD_GET_RESP) { if (req) { send_reply_to_slave(req->func, group, &req->sa_mad, status); --group->func[req->func].num_pend_reqs; list_del(&req->group_list); list_del(&req->func_list); kfree(req); ++rc; } else mcg_warn_group(group, "no request for failed join\n"); } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) ++rc; } else { u8 resp_join_state; u8 cur_join_state; resp_join_state = ((struct ib_sa_mcmember_data *) group->response_sa_mad.data)->scope_join_state & 7; cur_join_state = group->rec.scope_join_state & 7; if (method == IB_MGMT_METHOD_GET_RESP) { /* successfull join */ if (!cur_join_state && resp_join_state) --rc; } else if (!resp_join_state) ++rc; memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); } group->state = MCAST_IDLE; } process_requests: /* We should now go over pending join/leave requests, as long as we are idle. */ while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { req = list_first_entry(&group->pending_list, struct mcast_req, group_list); sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; req_join_state = sa_data->scope_join_state & 0x7; /* For a leave request, we will immediately answer the VF, and * update our internal counters. The actual leave will be sent * to SM later, if at all needed. We dequeue the request now. */ if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) rc += handle_leave_req(group, req_join_state, req); else rc += handle_join_req(group, req_join_state, req); } /* Handle leaves */ if (group->state == MCAST_IDLE) { req_join_state = get_leave_state(group); if (req_join_state) { group->rec.scope_join_state &= ~req_join_state; group->prev_state = group->state; if (send_leave_to_wire(group, req_join_state)) { group->state = group->prev_state; ++rc; } else group->state = MCAST_LEAVE_SENT; } } if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) goto process_requests; mutex_unlock(&group->lock); while (rc--) release_group(group, 0); } static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, __be64 tid, union ib_gid *new_mgid) { struct mcast_group *group = NULL, *cur_group; struct mcast_req *req; struct list_head *pos; struct list_head *n; mutex_lock(&ctx->mcg_table_lock); list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { group = list_entry(pos, struct mcast_group, mgid0_list); mutex_lock(&group->lock); if (group->last_req_tid == tid) { if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { group->rec.mgid = *new_mgid; sprintf(group->name, "%016llx%016llx", (long long)be64_to_cpu(group->rec.mgid.global.subnet_prefix), (long long)be64_to_cpu(group->rec.mgid.global.interface_id)); list_del_init(&group->mgid0_list); cur_group = mcast_insert(ctx, group); if (cur_group) { /* A race between our code and SM. Silently cleaning the new one */ req = list_first_entry(&group->pending_list, struct mcast_req, group_list); --group->func[req->func].num_pend_reqs; list_del(&req->group_list); list_del(&req->func_list); kfree(req); mutex_unlock(&group->lock); mutex_unlock(&ctx->mcg_table_lock); release_group(group, 0); return NULL; } atomic_inc(&group->refcount); add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); mutex_unlock(&group->lock); mutex_unlock(&ctx->mcg_table_lock); return group; } else { struct mcast_req *tmp1, *tmp2; list_del(&group->mgid0_list); if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) cancel_delayed_work_sync(&group->timeout_work); list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { list_del(&tmp1->group_list); kfree(tmp1); } mutex_unlock(&group->lock); mutex_unlock(&ctx->mcg_table_lock); kfree(group); return NULL; } } mutex_unlock(&group->lock); } mutex_unlock(&ctx->mcg_table_lock); return NULL; } static ssize_t sysfs_show_group(struct device *dev, struct device_attribute *attr, char *buf); static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, union ib_gid *mgid, int create, gfp_t gfp_mask) { struct mcast_group *group, *cur_group; int is_mgid0; int i; is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); if (!is_mgid0) { group = mcast_find(ctx, mgid); if (group) goto found; } if (!create) return ERR_PTR(-ENOENT); group = kzalloc(sizeof *group, gfp_mask); if (!group) return ERR_PTR(-ENOMEM); group->demux = ctx; group->rec.mgid = *mgid; INIT_LIST_HEAD(&group->pending_list); INIT_LIST_HEAD(&group->mgid0_list); for (i = 0; i < MAX_VFS; ++i) INIT_LIST_HEAD(&group->func[i].pending); INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); mutex_init(&group->lock); sprintf(group->name, "%016llx%016llx", (long long)be64_to_cpu( group->rec.mgid.global.subnet_prefix), (long long)be64_to_cpu( group->rec.mgid.global.interface_id)); sysfs_attr_init(&group->dentry.attr); group->dentry.show = sysfs_show_group; group->dentry.store = NULL; group->dentry.attr.name = group->name; group->dentry.attr.mode = 0400; group->state = MCAST_IDLE; if (is_mgid0) { list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); goto found; } cur_group = mcast_insert(ctx, group); if (cur_group) { mcg_warn("group just showed up %s - confused\n", cur_group->name); kfree(group); return ERR_PTR(-EINVAL); } add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); found: atomic_inc(&group->refcount); return group; } static void queue_req(struct mcast_req *req) { struct mcast_group *group = req->group; atomic_inc(&group->refcount); /* for the request */ atomic_inc(&group->refcount); /* for scheduling the work */ list_add_tail(&req->group_list, &group->pending_list); list_add_tail(&req->func_list, &group->func[req->func].pending); /* calls mlx4_ib_mcg_work_handler */ if (!queue_work(group->demux->mcg_wq, &group->work)) - safe_atomic_dec(&group->refcount); + safe_atomic_dec(&group->refcount); } int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *mad) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; struct mcast_group *group; switch (mad->mad_hdr.method) { case IB_MGMT_METHOD_GET_RESP: case IB_SA_METHOD_DELETE_RESP: mutex_lock(&ctx->mcg_table_lock); group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL); mutex_unlock(&ctx->mcg_table_lock); if (IS_ERR(group)) { if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { __be64 tid = mad->mad_hdr.tid; *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); } else group = NULL; } if (!group) return 1; mutex_lock(&group->lock); group->response_sa_mad = *mad; group->prev_state = group->state; group->state = MCAST_RESP_READY; /* calls mlx4_ib_mcg_work_handler */ atomic_inc(&group->refcount); if (!queue_work(ctx->mcg_wq, &group->work)) - safe_atomic_dec(&group->refcount); + safe_atomic_dec(&group->refcount); mutex_unlock(&group->lock); release_group(group, 0); return 1; /* consumed */ case IB_MGMT_METHOD_SET: case IB_SA_METHOD_GET_TABLE: case IB_SA_METHOD_GET_TABLE_RESP: case IB_SA_METHOD_DELETE: return 0; /* not consumed, pass-through to guest over tunnel */ default: mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", port, mad->mad_hdr.method); return 1; /* consumed */ } } int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; struct mcast_group *group; struct mcast_req *req; int may_create = 0; if (ctx->flushing) return -EAGAIN; switch (sa_mad->mad_hdr.method) { case IB_MGMT_METHOD_SET: may_create = 1; case IB_SA_METHOD_DELETE: req = kzalloc(sizeof *req, GFP_KERNEL); if (!req) return -ENOMEM; req->func = slave; req->sa_mad = *sa_mad; mutex_lock(&ctx->mcg_table_lock); group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL); mutex_unlock(&ctx->mcg_table_lock); if (IS_ERR(group)) { kfree(req); return PTR_ERR(group); } mutex_lock(&group->lock); if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { mutex_unlock(&group->lock); mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", port, slave, MAX_PEND_REQS_PER_FUNC); release_group(group, 0); kfree(req); return -ENOMEM; } ++group->func[slave].num_pend_reqs; req->group = group; queue_req(req); mutex_unlock(&group->lock); release_group(group, 0); return 1; /* consumed */ case IB_SA_METHOD_GET_TABLE: case IB_MGMT_METHOD_GET_RESP: case IB_SA_METHOD_GET_TABLE_RESP: case IB_SA_METHOD_DELETE_RESP: return 0; /* not consumed, pass-through */ default: mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", port, slave, sa_mad->mad_hdr.method); return 1; /* consumed */ } } static ssize_t sysfs_show_group(struct device *dev, struct device_attribute *attr, char *buf) { struct mcast_group *group = container_of(attr, struct mcast_group, dentry); struct mcast_req *req = NULL; char pending_str[40]; char state_str[40]; ssize_t len = 0; int f; if (group->state == MCAST_IDLE) sprintf(state_str, "%s", get_state_string(group->state)); else sprintf(state_str, "%s(TID=0x%llx)", get_state_string(group->state), (long long)be64_to_cpu(group->last_req_tid)); if (list_empty(&group->pending_list)) { sprintf(pending_str, "No"); } else { req = list_first_entry(&group->pending_list, struct mcast_req, group_list); sprintf(pending_str, "Yes(TID=0x%llx)", (long long)be64_to_cpu( req->sa_mad.mad_hdr.tid)); } len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", group->rec.scope_join_state & 0xf, group->members[2], group->members[1], group->members[0], atomic_read(&group->refcount), pending_str, state_str); for (f = 0; f < MAX_VFS; ++f) if (group->func[f].state == MCAST_MEMBER) len += sprintf(buf + len, "%d[%1x] ", f, group->func[f].join_state); len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " "%4x %4x %2x %2x)\n", be16_to_cpu(group->rec.pkey), be32_to_cpu(group->rec.qkey), (group->rec.mtusel_mtu & 0xc0) >> 6, group->rec.mtusel_mtu & 0x3f, group->rec.tclass, (group->rec.ratesel_rate & 0xc0) >> 6, group->rec.ratesel_rate & 0x3f, (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, group->rec.proxy_join); return len; } int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) { char name[20]; atomic_set(&ctx->tid, 0); sprintf(name, "mlx4_ib_mcg%d", ctx->port); ctx->mcg_wq = create_singlethread_workqueue(name); if (!ctx->mcg_wq) return -ENOMEM; mutex_init(&ctx->mcg_table_lock); ctx->mcg_table = RB_ROOT; INIT_LIST_HEAD(&ctx->mcg_mgid0_list); ctx->flushing = 0; return 0; } static void force_clean_group(struct mcast_group *group) { struct mcast_req *req, *tmp ; list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { list_del(&req->group_list); kfree(req); } del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); rb_erase(&group->node, &group->demux->mcg_table); kfree(group); } static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) { int i; struct rb_node *p; struct mcast_group *group; unsigned long end; int count; for (i = 0; i < MAX_VFS; ++i) clean_vf_mcast(ctx, i); end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); do { count = 0; mutex_lock(&ctx->mcg_table_lock); for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) ++count; mutex_unlock(&ctx->mcg_table_lock); if (!count) break; msleep(1); } while (time_after(end, jiffies)); flush_workqueue(ctx->mcg_wq); if (destroy_wq) destroy_workqueue(ctx->mcg_wq); mutex_lock(&ctx->mcg_table_lock); while ((p = rb_first(&ctx->mcg_table)) != NULL) { group = rb_entry(p, struct mcast_group, node); if (atomic_read(&group->refcount)) mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group); force_clean_group(group); } mutex_unlock(&ctx->mcg_table_lock); } struct clean_work { struct work_struct work; struct mlx4_ib_demux_ctx *ctx; int destroy_wq; }; static void mcg_clean_task(struct work_struct *work) { struct clean_work *cw = container_of(work, struct clean_work, work); _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); cw->ctx->flushing = 0; kfree(cw); } void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) { struct clean_work *work; if (ctx->flushing) return; ctx->flushing = 1; if (destroy_wq) { _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); ctx->flushing = 0; return; } work = kmalloc(sizeof *work, GFP_KERNEL); if (!work) { ctx->flushing = 0; mcg_warn("failed allocating work for cleanup\n"); return; } work->ctx = ctx; work->destroy_wq = destroy_wq; INIT_WORK(&work->work, mcg_clean_task); queue_work(clean_wq, &work->work); } static void build_leave_mad(struct mcast_req *req) { struct ib_sa_mad *mad = &req->sa_mad; mad->mad_hdr.method = IB_SA_METHOD_DELETE; } static void clear_pending_reqs(struct mcast_group *group, int vf) { struct mcast_req *req, *tmp, *group_first = NULL; int clear; int pend = 0; if (!list_empty(&group->pending_list)) group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { clear = 1; if (group_first == req && (group->state == MCAST_JOIN_SENT || group->state == MCAST_LEAVE_SENT)) { clear = cancel_delayed_work(&group->timeout_work); pend = !clear; group->state = MCAST_IDLE; } if (clear) { --group->func[vf].num_pend_reqs; list_del(&req->group_list); list_del(&req->func_list); kfree(req); atomic_dec(&group->refcount); } } if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); } } static int push_deleteing_req(struct mcast_group *group, int slave) { struct mcast_req *req; struct mcast_req *pend_req; if (!group->func[slave].join_state) return 0; req = kzalloc(sizeof *req, GFP_KERNEL); if (!req) { mcg_warn_group(group, "failed allocation - may leave stall groups\n"); return -ENOMEM; } if (!list_empty(&group->func[slave].pending)) { pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); if (pend_req->clean) { kfree(req); return 0; } } req->clean = 1; req->func = slave; req->group = group; ++group->func[slave].num_pend_reqs; build_leave_mad(req); queue_req(req); return 0; } void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) { struct mcast_group *group; struct rb_node *p; mutex_lock(&ctx->mcg_table_lock); for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { group = rb_entry(p, struct mcast_group, node); mutex_lock(&group->lock); if (atomic_read(&group->refcount)) { /* clear pending requests of this VF */ clear_pending_reqs(group, slave); push_deleteing_req(group, slave); } mutex_unlock(&group->lock); } mutex_unlock(&ctx->mcg_table_lock); } int mlx4_ib_mcg_init(void) { clean_wq = create_singlethread_workqueue("mlx4_ib_mcg"); if (!clean_wq) return -ENOMEM; return 0; } void mlx4_ib_mcg_destroy(void) { destroy_workqueue(clean_wq); } Index: head/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h (revision 296382) @@ -1,865 +1,865 @@ /* * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef MLX4_IB_H #define MLX4_IB_H #include #include #include #include #include #include #include #include #include #include #include #include #define MLX4_IB_DRV_NAME "mlx4_ib" #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ #define mlx4_ib_warn(ibdev, format, arg...) \ dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) #define mlx4_ib_info(ibdev, format, arg...) \ dev_info((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) enum { MLX4_IB_SQ_MIN_WQE_SHIFT = 6, MLX4_IB_MAX_HEADROOM = 2048 }; #define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) #define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) /*module param to indicate if SM assigns the alias_GUID*/ extern int mlx4_ib_sm_guid_assign; extern struct proc_dir_entry *mlx4_mrs_dir_entry; #define MLX4_IB_UC_STEER_QPN_ALIGN 1 #define MLX4_IB_UC_MAX_NUM_QPS (256 * 1024) #define MLX4_IB_MMAP_CMD_MASK 0xFF #define MLX4_IB_MMAP_CMD_BITS 8 struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; struct list_head db_page_list; struct mutex db_page_mutex; }; struct mlx4_ib_pd { struct ib_pd ibpd; u32 pdn; }; struct mlx4_ib_xrcd { struct ib_xrcd ibxrcd; u32 xrcdn; struct ib_pd *pd; struct ib_cq *cq; }; struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; int entry_size; }; struct mlx4_ib_cq_resize { struct mlx4_ib_cq_buf buf; int cqe; }; struct mlx4_shared_mr_info { int mr_id; struct ib_umem *umem; }; struct mlx4_ib_cq { struct ib_cq ibcq; struct mlx4_cq mcq; struct mlx4_ib_cq_buf buf; struct mlx4_ib_cq_resize *resize_buf; struct mlx4_db db; spinlock_t lock; struct mutex resize_mutex; struct ib_umem *umem; struct ib_umem *resize_umem; int create_flags; }; struct mlx4_ib_mr { struct ib_mr ibmr; struct mlx4_mr mmr; struct ib_umem *umem; struct mlx4_shared_mr_info *smr_info; atomic_t invalidated; struct completion invalidation_comp; }; struct mlx4_ib_mw { struct ib_mw ibmw; struct mlx4_mw mmw; }; struct mlx4_ib_fast_reg_page_list { struct ib_fast_reg_page_list ibfrpl; __be64 *mapped_page_list; dma_addr_t map; }; struct mlx4_ib_fmr { struct ib_fmr ibfmr; struct mlx4_fmr mfmr; }; struct mlx4_ib_flow { struct ib_flow ibflow; /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ u64 reg_id[2]; }; struct mlx4_ib_wq { u64 *wrid; spinlock_t lock; int wqe_cnt; int max_post; int max_gs; int offset; int wqe_shift; unsigned head; unsigned tail; }; enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_CAP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, MLX4_IB_QP_CAP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, MLX4_IB_QP_CAP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, }; struct mlx4_ib_gid_entry { struct list_head list; union ib_gid gid; int added; u8 port; }; enum mlx4_ib_mmap_cmd { MLX4_IB_MMAP_UAR_PAGE = 0, MLX4_IB_MMAP_BLUE_FLAME_PAGE = 1, MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES = 2, MLX4_IB_MMAP_GET_HW_CLOCK = 3, }; enum mlx4_ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ MLX4_IB_QPT_SMI = IB_QPT_SMI, MLX4_IB_QPT_GSI = IB_QPT_GSI, MLX4_IB_QPT_RC = IB_QPT_RC, MLX4_IB_QPT_UC = IB_QPT_UC, MLX4_IB_QPT_UD = IB_QPT_UD, MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, MLX4_IB_QPT_PROXY_SMI = 1 << 17, MLX4_IB_QPT_PROXY_GSI = 1 << 18, MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, MLX4_IB_QPT_TUN_SMI = 1 << 20, MLX4_IB_QPT_TUN_GSI = 1 << 21, }; #define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) enum mlx4_ib_mad_ifc_flags { MLX4_MAD_IFC_IGNORE_MKEY = 1, MLX4_MAD_IFC_IGNORE_BKEY = 2, MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | MLX4_MAD_IFC_IGNORE_BKEY), MLX4_MAD_IFC_NET_VIEW = 4, }; enum { MLX4_NUM_TUNNEL_BUFS = 256, }; struct mlx4_ib_tunnel_header { struct mlx4_av av; __be32 remote_qpn; __be32 qkey; __be16 vlan; u8 mac[6]; __be16 pkey_index; u8 reserved[6]; }; struct mlx4_ib_buf { void *addr; dma_addr_t map; }; struct mlx4_rcv_tunnel_hdr { __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: * 0x0 - no vlan was in the packet * 0x01 - C-VLAN was in the packet */ u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ u8 reserved; __be16 pkey_index; __be16 sl_vid; __be16 slid_mac_47_32; __be32 mac_31_0; }; struct mlx4_ib_proxy_sqp_hdr { struct ib_grh grh; struct mlx4_rcv_tunnel_hdr tun; } __packed; struct mlx4_roce_smac_vlan_info { u64 smac; int smac_index; int smac_port; u64 candidate_smac; int candidate_smac_index; int candidate_smac_port; u16 vid; int vlan_index; int vlan_port; u16 candidate_vid; int candidate_vlan_index; int candidate_vlan_port; int update_vid; }; struct mlx4_ib_qpg_data { unsigned long *tss_bitmap; unsigned long *rss_bitmap; struct mlx4_ib_qp *qpg_parent; int tss_qpn_base; int rss_qpn_base; u32 tss_child_count; u32 rss_child_count; u32 qpg_tss_mask_sz; }; struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; struct mlx4_buf buf; struct mlx4_db db; struct mlx4_ib_wq rq; u32 doorbell_qpn; __be32 sq_signal_bits; unsigned sq_next_wqe; int sq_max_wqes_per_wr; int sq_spare_wqes; struct mlx4_ib_wq sq; enum mlx4_ib_qp_type mlx4_ib_qp_type; struct ib_umem *umem; struct mlx4_mtt mtt; int buf_size; struct mutex mutex; u16 xrcdn; u32 flags; u8 port; u8 alt_port; u8 atomic_rd_en; u8 resp_depth; u8 sq_no_prefetch; u8 state; int mlx_type; enum ib_qpg_type qpg_type; struct mlx4_ib_qpg_data *qpg_data; struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; struct mlx4_roce_smac_vlan_info pri; struct mlx4_roce_smac_vlan_info alt; struct list_head rules_list; u64 reg_id; int max_inline_data; struct mlx4_bf bf; /* * Experimental data */ int max_inlr_data; }; struct mlx4_ib_srq { struct ib_srq ibsrq; struct mlx4_srq msrq; struct mlx4_buf buf; struct mlx4_db db; u64 *wrid; spinlock_t lock; int head; int tail; u16 wqe_ctr; struct ib_umem *umem; struct mlx4_mtt mtt; struct mutex mutex; }; struct mlx4_ib_ah { struct ib_ah ibah; union mlx4_ext_av av; }; /****************************************/ /* alias guid support */ /****************************************/ #define NUM_PORT_ALIAS_GUID 2 #define NUM_ALIAS_GUID_IN_REC 8 #define NUM_ALIAS_GUID_REC_IN_PORT 16 #define GUID_REC_SIZE 8 #define NUM_ALIAS_GUID_PER_PORT 128 #define MLX4_NOT_SET_GUID (0x00LL) #define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) /****************************************/ /* ioctl codes */ /****************************************/ #define MLX4_IOC_MAGIC 'm' #define MLX4_IOCHWCLOCKOFFSET _IOR(MLX4_IOC_MAGIC, 1, int) enum mlx4_guid_alias_rec_status { MLX4_GUID_INFO_STATUS_IDLE, MLX4_GUID_INFO_STATUS_SET, MLX4_GUID_INFO_STATUS_PENDING, }; enum mlx4_guid_alias_rec_ownership { MLX4_GUID_DRIVER_ASSIGN, MLX4_GUID_SYSADMIN_ASSIGN, MLX4_GUID_NONE_ASSIGN, /*init state of each record*/ }; enum mlx4_guid_alias_rec_method { MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, }; struct mlx4_sriov_alias_guid_info_rec_det { u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ u8 method; /*set or delete*/ enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/ }; struct mlx4_sriov_alias_guid_port_rec_det { struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; struct workqueue_struct *wq; struct delayed_work alias_guid_work; u8 port; struct mlx4_sriov_alias_guid *parent; struct list_head cb_list; }; struct mlx4_sriov_alias_guid { struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; spinlock_t ag_work_lock; struct ib_sa_client *sa_client; }; struct mlx4_ib_demux_work { struct work_struct work; struct mlx4_ib_dev *dev; int slave; int do_init; u8 port; }; struct mlx4_ib_tun_tx_buf { struct mlx4_ib_buf buf; struct ib_ah *ah; }; struct mlx4_ib_demux_pv_qp { struct ib_qp *qp; enum ib_qp_type proxy_qpt; struct mlx4_ib_buf *ring; struct mlx4_ib_tun_tx_buf *tx_ring; spinlock_t tx_lock; unsigned tx_ix_head; unsigned tx_ix_tail; }; enum mlx4_ib_demux_pv_state { DEMUX_PV_STATE_DOWN, DEMUX_PV_STATE_STARTING, DEMUX_PV_STATE_ACTIVE, DEMUX_PV_STATE_DOWNING, }; struct mlx4_ib_demux_pv_ctx { int port; int slave; enum mlx4_ib_demux_pv_state state; int has_smi; struct ib_device *ib_dev; struct ib_cq *cq; struct ib_pd *pd; struct ib_mr *mr; struct work_struct work; struct workqueue_struct *wq; struct mlx4_ib_demux_pv_qp qp[2]; }; struct mlx4_ib_demux_ctx { struct ib_device *ib_dev; int port; struct workqueue_struct *wq; struct workqueue_struct *ud_wq; spinlock_t ud_lock; __be64 subnet_prefix; __be64 guid_cache[128]; struct mlx4_ib_dev *dev; /* the following lock protects both mcg_table and mcg_mgid0_list */ struct mutex mcg_table_lock; struct rb_root mcg_table; struct list_head mcg_mgid0_list; struct workqueue_struct *mcg_wq; struct mlx4_ib_demux_pv_ctx **tun; atomic_t tid; int flushing; /* flushing the work queue */ }; struct mlx4_ib_sriov { struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; /* when using this spinlock you should use "irq" because * it may be called from interrupt context.*/ spinlock_t going_down_lock; int is_going_down; struct mlx4_sriov_alias_guid alias_guid; /* CM paravirtualization fields */ struct list_head cm_list; spinlock_t id_map_lock; struct rb_root sl_id_map; struct idr pv_id_table; }; struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; struct net_device *masters[MLX4_MAX_PORTS]; - struct notifier_block nb; + struct notifier_block nb; struct notifier_block nb_inet; union ib_gid gid_table[MLX4_MAX_PORTS][128]; }; struct pkey_mgt { u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; struct list_head pkey_port_list[MLX4_MFUNC_MAX]; struct kobject *device_parent[MLX4_MFUNC_MAX]; }; struct mlx4_ib_iov_sysfs_attr { void *ctx; struct kobject *kobj; unsigned long data; u32 entry_num; char name[15]; struct device_attribute dentry; struct device *dev; }; struct mlx4_ib_iov_sysfs_attr_ar { struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; }; struct mlx4_ib_iov_port { char name[100]; u8 num; struct mlx4_ib_dev *dev; struct list_head list; struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; struct ib_port_attr attr; struct kobject *cur_port; struct kobject *admin_alias_parent; struct kobject *gids_parent; struct kobject *pkeys_parent; struct kobject *mcgs_parent; struct mlx4_ib_iov_sysfs_attr mcg_dentry; }; struct mlx4_ib_counter { int counter_index; int status; }; struct mlx4_ib_dev { struct ib_device ib_dev; struct mlx4_dev *dev; int num_ports; struct mlx4_uar priv_uar; u32 priv_pdn; MLX4_DECLARE_DOORBELL_LOCK(uar_lock); struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; struct ib_ah *sm_ah[MLX4_MAX_PORTS]; spinlock_t sm_lock; struct mlx4_ib_sriov sriov; struct mutex cap_mask_mutex; bool ib_active; struct mlx4_ib_iboe iboe; struct mlx4_ib_counter counters[MLX4_MAX_PORTS]; int *eq_table; int eq_added; struct kobject *iov_parent; struct kobject *ports_parent; struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; struct pkey_mgt pkeys; unsigned long *ib_uc_qpns_bitmap; int steer_qpn_count; int steer_qpn_base; }; struct ib_event_work { struct work_struct work; struct mlx4_ib_dev *ib_dev; struct mlx4_eqe ib_eqe; }; struct mlx4_ib_qp_tunnel_init_attr { struct ib_qp_init_attr init_attr; int slave; enum ib_qp_type proxy_qp_type; u8 port; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); } static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) { return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); } static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) { return container_of(ibpd, struct mlx4_ib_pd, ibpd); } static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) { return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); } static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx4_ib_cq, ibcq); } static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) { return container_of(mcq, struct mlx4_ib_cq, mcq); } static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) { return container_of(ibmr, struct mlx4_ib_mr, ibmr); } static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) { return container_of(ibmw, struct mlx4_ib_mw, ibmw); } static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) { return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl); } static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) { return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); } static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) { return container_of(ibflow, struct mlx4_ib_flow, ibflow); } static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx4_ib_qp, ibqp); } static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) { return container_of(mqp, struct mlx4_ib_qp, mqp); } static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) { return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); } static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) { return container_of(msrq, struct mlx4_ib_srq, msrq); } static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) { return container_of(ibah, struct mlx4_ib_ah, ibah); } int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db); void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem); int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, int *num_of_mtts); struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata, int mr_id); int mlx4_ib_dereg_mr(struct ib_mr *mr); struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len); void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); int mlx4_ib_modify_cq(struct ib_cq *cq, struct ib_cq_attr *cq_attr, int cq_attr_mask); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int mlx4_ib_destroy_cq(struct ib_cq *cq); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int mlx4_ib_destroy_ah(struct ib_ah *ah); struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata); int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int mlx4_ib_destroy_srq(struct ib_srq *srq); void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int mlx4_ib_destroy_qp(struct ib_qp *qp); int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad); int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad); int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova); int mlx4_ib_unmap_fmr(struct list_head *fmr_list); int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view); int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view); int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view); int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast, u8 port); int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index, union mlx4_counter *counter, u8 clear); static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) return 1; return !!(ah->av.ib.g_slid & 0x80); } static inline int mlx4_ib_qp_has_rq(struct ib_qp_init_attr *attr) { if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) return 0; return !attr->srq; } int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); int mlx4_ib_mcg_init(void); void mlx4_ib_mcg_destroy(void); int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad); int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *mad); int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid); void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, enum ib_event_type type); void mlx4_ib_tunnels_update_work(struct work_struct *work); int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad); int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, struct ib_mad *mad); __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, struct ib_mad *mad, int is_eth); int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, struct ib_mad *mad); void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); /* alias guid support */ void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8 *p_data); void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8 *p_data); int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); __be64 mlx4_ib_gen_node_guid(void); int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach); int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props); #endif /* MLX4_IB_H */ Index: head/sys/ofed/drivers/infiniband/hw/mlx4/mr.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/mr.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/mr.c (revision 296382) @@ -1,885 +1,885 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include "mlx4_ib.h" static u32 convert_access(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) | (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) | MLX4_PERM_LOCAL_READ; } /* No suuport for Shared MR feature */ #if 0 static ssize_t shared_mr_proc_read(struct file *file, char __user *buffer, size_t len, loff_t *offset) { return -ENOSYS; } static ssize_t shared_mr_proc_write(struct file *file, const char __user *buffer, size_t len, loff_t *offset) { return -ENOSYS; } static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode); struct mlx4_shared_mr_info *smr_info = (struct mlx4_shared_mr_info *)pde->data; /* Prevent any mapping not on start of area */ if (vma->vm_pgoff != 0) return -EINVAL; return ib_umem_map_to_vma(smr_info->umem, vma); } static const struct file_operations shared_mr_proc_ops = { .owner = THIS_MODULE, .read = shared_mr_proc_read, .write = shared_mr_proc_write, .mmap = shared_mr_mmap }; static mode_t convert_shared_access(int acc) { return (acc & IB_ACCESS_SHARED_MR_USER_READ ? S_IRUSR : 0) | (acc & IB_ACCESS_SHARED_MR_USER_WRITE ? S_IWUSR : 0) | (acc & IB_ACCESS_SHARED_MR_GROUP_READ ? S_IRGRP : 0) | (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE ? S_IWGRP : 0) | (acc & IB_ACCESS_SHARED_MR_OTHER_READ ? S_IROTH : 0) | (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE ? S_IWOTH : 0); } #endif struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct mlx4_ib_mr *mr; int err; mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0, ~0ull, convert_access(acc), 0, 0, &mr->mmr); if (err) goto err_free; err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr); if (err) goto err_mr; mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; mr->umem = NULL; return &mr->ibmr; err_mr: (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_free: kfree(mr); return ERR_PTR(err); } static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, u64 mtt_size, u64 mtt_shift, u64 len, u64 cur_start_addr, u64 *pages, int *start_index, int *npages) { int k; int err = 0; u64 mtt_entries; u64 cur_end_addr = cur_start_addr + len; u64 cur_end_addr_aligned = 0; len += (cur_start_addr & (mtt_size-1ULL)); cur_end_addr_aligned = round_up(cur_end_addr, mtt_size); len += (cur_end_addr_aligned - cur_end_addr); if (len & (mtt_size-1ULL)) { WARN(1 , "write_block: len %llx is not aligned to mtt_size %llx\n", (unsigned long long)len, (unsigned long long)mtt_size); return -EINVAL; } mtt_entries = (len >> mtt_shift); /* Align the MTT start address to the mtt_size. Required to handle cases when the MR starts in the middle of an MTT record. Was not required in old code since the physical addresses provided by the dma subsystem were page aligned, which was also the MTT size. */ cur_start_addr = round_down(cur_start_addr, mtt_size); /* A new block is started ...*/ for (k = 0; k < mtt_entries; ++k) { pages[*npages] = cur_start_addr + (mtt_size * k); (*npages)++; /* * Be friendly to mlx4_write_mtt() and * pass it chunks of appropriate size. */ if (*npages == PAGE_SIZE / sizeof(u64)) { err = mlx4_write_mtt(dev->dev, mtt, *start_index, *npages, pages); if (err) return err; (*start_index) += *npages; *npages = 0; } } return 0; } int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { u64 *pages; u64 len = 0; int err = 0; u64 mtt_size; u64 cur_start_addr = 0; u64 mtt_shift; int start_index = 0; int npages = 0; struct scatterlist *sg; int i; pages = (u64 *) __get_free_page(GFP_KERNEL); if (!pages) return -ENOMEM; mtt_shift = mtt->page_shift; mtt_size = 1ULL << mtt_shift; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { if (cur_start_addr + len == sg_dma_address(sg)) { /* still the same block */ len += sg_dma_len(sg); continue; } /* A new block is started ...*/ /* If len is malaligned, write an extra mtt entry to cover the misaligned area (round up the division) */ err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, mtt_shift, len, cur_start_addr, pages, &start_index, &npages); if (err) goto out; cur_start_addr = sg_dma_address(sg); len = sg_dma_len(sg); - } + } /* Handle the last block */ if (len > 0) { /* If len is malaligned, write an extra mtt entry to cover the misaligned area (round up the division) */ err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, mtt_shift, len, cur_start_addr, pages, &start_index, &npages); if (err) goto out; } if (npages) err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages); out: free_page((unsigned long) pages); return err; } static inline u64 alignment_of(u64 ptr) { return ilog2(ptr & (~(ptr-1))); } static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start, u64 current_block_end, u64 block_shift) { /* Check whether the alignment of the new block is aligned as well as the previous block. Block address must start with zeros till size of entity_size. */ if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0) /* It is not as well aligned as the previous block-reduce the mtt size accordingly. Here we take the last right bit which is 1. */ block_shift = alignment_of(next_block_start); /* Check whether the alignment of the end of previous block - is it aligned as well as the start of the block */ if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0) /* It is not as well aligned as the start of the block - reduce the mtt size accordingly. */ block_shift = alignment_of(current_block_end); return block_shift; } /* Calculate optimal mtt size based on contiguous pages. * Function will return also the number of pages that are not aligned to the calculated mtt_size to be added to total number of pages. For that we should check the first chunk length & last chunk length and if not aligned to mtt_size we should increment the non_aligned_pages number. All chunks in the middle already handled as part of mtt shift calculation for both their start & end addresses. */ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, int *num_of_mtts) { u64 block_shift = MLX4_MAX_MTT_SHIFT; u64 current_block_len = 0; u64 current_block_start = 0; u64 misalignment_bits; u64 first_block_start = 0; u64 last_block_end = 0; u64 total_len = 0; u64 last_block_aligned_end = 0; u64 min_shift = ilog2(umem->page_size); struct scatterlist *sg; int i; u64 next_block_start; u64 current_block_end; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { /* Initialization - save the first chunk start as the current_block_start - block means contiguous pages. */ if (current_block_len == 0 && current_block_start == 0) { first_block_start = current_block_start = sg_dma_address(sg); /* Find the bits that are different between the physical address and the virtual address for the start of the MR. */ /* umem_get aligned the start_va to a page boundry. Therefore, we need to align the start va to the same boundry */ /* misalignment_bits is needed to handle the case of a single memory region. In this case, the rest of the logic will not reduce the block size. If we use a block size which is bigger than the alignment of the misalignment bits, we might use the virtual page number instead of the physical page number, resulting in access to the wrong data. */ misalignment_bits = (start_va & (~(((u64)(umem->page_size))-1ULL))) ^ current_block_start; block_shift = min(alignment_of(misalignment_bits) , block_shift); } /* Go over the scatter entries and check if they continue the previous scatter entry. */ next_block_start = sg_dma_address(sg); current_block_end = current_block_start - + current_block_len; - /* If we have a split (non-contig.) between two block*/ - if (current_block_end != next_block_start) { - block_shift = mlx4_ib_umem_calc_block_mtt( - next_block_start, - current_block_end, - block_shift); + + current_block_len; + /* If we have a split (non-contig.) between two block*/ + if (current_block_end != next_block_start) { + block_shift = mlx4_ib_umem_calc_block_mtt( + next_block_start, + current_block_end, + block_shift); - /* If we reached the minimum shift for 4k - page we stop the loop. - */ - if (block_shift <= min_shift) - goto end; + /* If we reached the minimum shift for 4k + page we stop the loop. + */ + if (block_shift <= min_shift) + goto end; - /* If not saved yet we are in first block - - we save the length of first block to - calculate the non_aligned_pages number at - * the end. - */ - total_len += current_block_len; + /* If not saved yet we are in first block - + we save the length of first block to + calculate the non_aligned_pages number at + * the end. + */ + total_len += current_block_len; - /* Start a new block */ - current_block_start = next_block_start; - current_block_len = + /* Start a new block */ + current_block_start = next_block_start; + current_block_len = sg_dma_len(sg); - continue; - } - /* The scatter entry is another part of - the current block, increase the block size - * An entry in the scatter can be larger than - 4k (page) as of dma mapping - which merge some blocks together. - */ - current_block_len += + continue; + } + /* The scatter entry is another part of + the current block, increase the block size + * An entry in the scatter can be larger than + 4k (page) as of dma mapping + which merge some blocks together. + */ + current_block_len += sg_dma_len(sg); } /* Account for the last block in the total len */ total_len += current_block_len; /* Add to the first block the misalignment that it suffers from.*/ total_len += (first_block_start & ((1ULL<> block_shift; end: if (block_shift < min_shift) { /* If shift is less than the min we set a WARN and return the min shift. */ WARN(1, "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n", (unsigned long long)block_shift); block_shift = min_shift; } return block_shift; } /* No suuport for Shared MR */ #if 0 static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id) { struct proc_dir_entry *mr_proc_entry; mode_t mode = S_IFREG; char name_buff[16]; mode |= convert_shared_access(access_flags); sprintf(name_buff, "%X", mr_id); mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL); mr->smr_info->mr_id = mr_id; mr->smr_info->umem = mr->umem; mr_proc_entry = proc_create_data(name_buff, mode, mlx4_mrs_dir_entry, &shared_mr_proc_ops, mr->smr_info); if (!mr_proc_entry) { pr_err("prepare_shared_mr failed via proc\n"); kfree(mr->smr_info); return -ENODEV; } current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid)); mr_proc_entry->size = mr->umem->length; return 0; } static int is_shared_mr(int access_flags) { /* We should check whether IB_ACCESS_SHARED_MR_USER_READ or other shared bits were turned on. */ return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ | IB_ACCESS_SHARED_MR_USER_WRITE | IB_ACCESS_SHARED_MR_GROUP_READ | IB_ACCESS_SHARED_MR_GROUP_WRITE | IB_ACCESS_SHARED_MR_OTHER_READ | IB_ACCESS_SHARED_MR_OTHER_WRITE)); } static void free_smr_info(struct mlx4_ib_mr *mr) { /* When master/parent shared mr is dereged there is no ability to share this mr any more - its mr_id will be returned to the kernel as part of ib_uverbs_dereg_mr and may be allocated again as part of other reg_mr. */ char name_buff[16]; sprintf(name_buff, "%X", mr->smr_info->mr_id); /* Remove proc entry is checking internally that no operation was strated on that proc fs file and if in the middle current process will wait till end of operation. That's why no sync mechanism is needed when we release below the shared umem. */ remove_proc_entry(name_buff, mlx4_mrs_dir_entry); kfree(mr->smr_info); mr->smr_info = NULL; } #endif static void mlx4_invalidate_umem(void *invalidation_cookie, struct ib_umem *umem, unsigned long addr, size_t size) { struct mlx4_ib_mr *mr = (struct mlx4_ib_mr *)invalidation_cookie; /* This function is called under client peer lock so its resources are race protected */ if (atomic_inc_return(&mr->invalidated) > 1) { umem->invalidation_ctx->inflight_invalidation = 1; goto end; } umem->invalidation_ctx->peer_callback = 1; mlx4_mr_free(to_mdev(mr->ibmr.device)->dev, &mr->mmr); ib_umem_release(umem); complete(&mr->invalidation_comp); end: return; } struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata, int mr_id) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_mr *mr; int shift; int err; int n; struct ib_peer_memory_client *ib_peer_mem; mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); mr->umem = ib_umem_get_ex(pd->uobject->context, start, length, access_flags, 0, 1); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; } ib_peer_mem = mr->umem->ib_peer_mem; n = ib_umem_page_count(mr->umem); shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n); err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, convert_access(access_flags), n, shift, &mr->mmr); if (err) goto err_umem; err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); if (err) goto err_mr; err = mlx4_mr_enable(dev->dev, &mr->mmr); if (err) goto err_mr; mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; /* No suuport for Shared MR */ #if 0 /* Check whether MR should be shared */ if (is_shared_mr(access_flags)) { /* start address and length must be aligned to page size in order to map a full page and preventing leakage of data */ if (mr->umem->offset || (length & ~PAGE_MASK)) { err = -EINVAL; goto err_mr; } err = prepare_shared_mr(mr, access_flags, mr_id); if (err) goto err_mr; } #endif if (ib_peer_mem) { if (access_flags & IB_ACCESS_MW_BIND) { /* Prevent binding MW on peer clients. * mlx4_invalidate_umem must be void, * therefore, mlx4_mr_free should not fail * when using peer clients. */ err = -ENOSYS; pr_err("MW is not supported with peer memory client"); goto err_smr; } init_completion(&mr->invalidation_comp); ib_umem_activate_invalidation_notifier(mr->umem, mlx4_invalidate_umem, mr); } atomic_set(&mr->invalidated, 0); return &mr->ibmr; err_smr: /* No suuport for Shared MR */ #if 0 if (mr->smr_info) free_smr_info(mr); #endif err_mr: (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_umem: ib_umem_release(mr->umem); err_free: kfree(mr); return ERR_PTR(err); } int mlx4_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); struct ib_umem *umem = mr->umem; int ret; /* No suuport for Shared MR */ #if 0 if (mr->smr_info) free_smr_info(mr); #endif if (atomic_inc_return(&mr->invalidated) > 1) { wait_for_completion(&mr->invalidation_comp); goto end; } ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); if (ret) { /* Error is not expected here, except when memory windows * are bound to MR which is not supported with * peer memory clients */ atomic_set(&mr->invalidated, 0); return ret; } if (!umem) goto end; - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); end: kfree(mr); return 0; } struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_mw *mw; int err; mw = kmalloc(sizeof(*mw), GFP_KERNEL); if (!mw) return ERR_PTR(-ENOMEM); err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, (enum mlx4_mw_type)type, &mw->mmw); if (err) goto err_free; err = mlx4_mw_enable(dev->dev, &mw->mmw); if (err) goto err_mw; mw->ibmw.rkey = mw->mmw.key; return &mw->ibmw; err_mw: mlx4_mw_free(dev->dev, &mw->mmw); err_free: kfree(mw); return ERR_PTR(err); } int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind) { struct ib_send_wr wr; struct ib_send_wr *bad_wr; int ret; memset(&wr, 0, sizeof(wr)); wr.opcode = IB_WR_BIND_MW; wr.wr_id = mw_bind->wr_id; wr.send_flags = mw_bind->send_flags; wr.wr.bind_mw.mw = mw; wr.wr.bind_mw.bind_info = mw_bind->bind_info; wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey); ret = mlx4_ib_post_send(qp, &wr, &bad_wr); if (!ret) mw->rkey = wr.wr.bind_mw.rkey; return ret; } int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) { struct mlx4_ib_mw *mw = to_mmw(ibmw); mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); kfree(mw); return 0; } struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_mr *mr; int err; mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0, max_page_list_len, 0, &mr->mmr); if (err) goto err_free; err = mlx4_mr_enable(dev->dev, &mr->mmr); if (err) goto err_mr; mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; mr->umem = NULL; return &mr->ibmr; err_mr: (void) mlx4_mr_free(dev->dev, &mr->mmr); err_free: kfree(mr); return ERR_PTR(err); } struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_fast_reg_page_list *mfrpl; int size = page_list_len * sizeof (u64); if (page_list_len > MLX4_MAX_FAST_REG_PAGES) return ERR_PTR(-EINVAL); mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL); if (!mfrpl) return ERR_PTR(-ENOMEM); mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); if (!mfrpl->ibfrpl.page_list) goto err_free; mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev, size, &mfrpl->map, GFP_KERNEL); if (!mfrpl->mapped_page_list) goto err_free; WARN_ON(mfrpl->map & 0x3f); return &mfrpl->ibfrpl; err_free: kfree(mfrpl->ibfrpl.page_list); kfree(mfrpl); return ERR_PTR(-ENOMEM); } void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) { struct mlx4_ib_dev *dev = to_mdev(page_list->device); struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); int size = page_list->max_page_list_len * sizeof (u64); dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list, mfrpl->map); kfree(mfrpl->ibfrpl.page_list); kfree(mfrpl); } struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, struct ib_fmr_attr *fmr_attr) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_fmr *fmr; int err = -ENOMEM; fmr = kmalloc(sizeof *fmr, GFP_KERNEL); if (!fmr) return ERR_PTR(-ENOMEM); err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), fmr_attr->max_pages, fmr_attr->max_maps, fmr_attr->page_shift, &fmr->mfmr); if (err) goto err_free; err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr); if (err) goto err_mr; fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; return &fmr->ibfmr; err_mr: (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); err_free: kfree(fmr); return ERR_PTR(err); } int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova) { struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); } int mlx4_ib_unmap_fmr(struct list_head *fmr_list) { struct ib_fmr *ibfmr; int err; struct mlx4_dev *mdev = NULL; list_for_each_entry(ibfmr, fmr_list, list) { if (mdev && to_mdev(ibfmr->device)->dev != mdev) return -EINVAL; mdev = to_mdev(ibfmr->device)->dev; } if (!mdev) return 0; list_for_each_entry(ibfmr, fmr_list, list) { struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); } /* * Make sure all MPT status updates are visible before issuing * SYNC_TPT firmware command. */ wmb(); err = mlx4_SYNC_TPT(mdev); if (err) pr_warn("SYNC_TPT error %d when " "unmapping FMRs\n", err); return 0; } int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) { struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); int err; err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); if (!err) kfree(ifmr); return err; } Index: head/sys/ofed/drivers/infiniband/hw/mlx4/qp.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/qp.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/qp.c (revision 296382) @@ -1,3687 +1,3687 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include "user.h" #define asm __asm enum { MLX4_IB_ACK_REQ_FREQ = 8, }; enum { MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, MLX4_IB_LINK_TYPE_IB = 0, MLX4_IB_LINK_TYPE_ETH = 1 }; enum { /* * Largest possible UD header: send with GRH and immediate * data plus 18 bytes for an Ethernet header with VLAN/802.1Q * tag. (LRH would only use 8 bytes, so Ethernet is the * biggest case) */ MLX4_IB_UD_HEADER_SIZE = 82, MLX4_IB_LSO_HEADER_SPARE = 128, }; enum { MLX4_IB_IBOE_ETHERTYPE = 0x8915 }; struct mlx4_ib_sqp { struct mlx4_ib_qp qp; int pkey_index; u32 qkey; u32 send_psn; struct ib_ud_header ud_header; u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; }; enum { MLX4_IB_MIN_SQ_STRIDE = 6, MLX4_IB_CACHE_LINE_SIZE = 64, }; enum { MLX4_RAW_QP_MTU = 7, MLX4_RAW_QP_MSGMAX = 31, }; static const __be32 mlx4_ib_opcode[] = { [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), [IB_WR_BIND_MW] = cpu_to_be32( MLX4_OPCODE_BIND_MW), }; #ifndef wc_wmb #if defined(__i386__) #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") #elif defined(__x86_64__) #define wc_wmb() asm volatile("sfence" ::: "memory") #elif defined(__ia64__) #define wc_wmb() asm volatile("fwb" ::: "memory") #else #define wc_wmb() wmb() #endif #endif static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) { return container_of(mqp, struct mlx4_ib_sqp, qp); } static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { if (!mlx4_is_master(dev->dev)) return 0; return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX; } static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { int proxy_sqp = 0; int real_sqp = 0; int i; /* PPF or Native -- real SQP */ real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); if (real_sqp) return 1; /* VF or PF -- proxy SQP */ if (mlx4_is_mfunc(dev->dev)) { for (i = 0; i < dev->dev->caps.num_ports; i++) { if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] || qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) { proxy_sqp = 1; break; } } } return proxy_sqp; } /* used for INIT/CLOSE port logic */ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { int proxy_qp0 = 0; int real_qp0 = 0; int i; /* PPF or Native -- real QP0 */ real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); if (real_qp0) return 1; /* VF or PF -- proxy QP0 */ if (mlx4_is_mfunc(dev->dev)) { for (i = 0; i < dev->dev->caps.num_ports; i++) { if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) { proxy_qp0 = 1; break; } } } return proxy_qp0; } static void *get_wqe(struct mlx4_ib_qp *qp, int offset) { return mlx4_buf_offset(&qp->buf, offset); } static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) { return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); } static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) { return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); } /* * Stamp a SQ WQE so that it is invalid if prefetched by marking the * first four bytes of every 64 byte chunk with * 0x7FFFFFF | (invalid_ownership_value << 31). * * When the max work request size is less than or equal to the WQE * basic block size, as an optimization, we can stamp all WQEs with * 0xffffffff, and skip the very first chunk of each WQE. */ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) { __be32 *wqe; int i; int s; int ind; void *buf; __be32 stamp; struct mlx4_wqe_ctrl_seg *ctrl; if (qp->sq_max_wqes_per_wr > 1) { s = roundup(size, 1U << qp->sq.wqe_shift); for (i = 0; i < s; i += 64) { ind = (i >> qp->sq.wqe_shift) + n; stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : cpu_to_be32(0xffffffff); buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); *wqe = stamp; } } else { ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); s = (ctrl->fence_size & 0x3f) << 4; for (i = 64; i < s; i += 64) { wqe = buf + i; *wqe = cpu_to_be32(0xffffffff); } } } static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) { struct mlx4_wqe_ctrl_seg *ctrl; struct mlx4_wqe_inline_seg *inl; void *wqe; int s; ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); s = sizeof(struct mlx4_wqe_ctrl_seg); if (qp->ibqp.qp_type == IB_QPT_UD) { struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; struct mlx4_av *av = (struct mlx4_av *)dgram->av; memset(dgram, 0, sizeof *dgram); av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); s += sizeof(struct mlx4_wqe_datagram_seg); } /* Pad the remainder of the WQE with an inline data segment. */ if (size > s) { inl = wqe + s; inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); } ctrl->srcrb_flags = 0; ctrl->fence_size = size / 16; /* * Make sure descriptor is fully written before setting ownership bit * (because HW can start executing as soon as we do). */ wmb(); ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); } /* Post NOP WQE to prevent wrap-around in the middle of WR */ static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) { unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); if (unlikely(s < qp->sq_max_wqes_per_wr)) { post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); ind += s; } return ind; } static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) { struct ib_event event; struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; if (type == MLX4_EVENT_TYPE_PATH_MIG) to_mibqp(qp)->port = to_mibqp(qp)->alt_port; if (ibqp->event_handler) { event.device = ibqp->device; event.element.qp = ibqp; switch (type) { case MLX4_EVENT_TYPE_PATH_MIG: event.event = IB_EVENT_PATH_MIG; break; case MLX4_EVENT_TYPE_COMM_EST: event.event = IB_EVENT_COMM_EST; break; case MLX4_EVENT_TYPE_SQ_DRAINED: event.event = IB_EVENT_SQ_DRAINED; break; case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: event.event = IB_EVENT_QP_LAST_WQE_REACHED; break; case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: event.event = IB_EVENT_QP_FATAL; break; case MLX4_EVENT_TYPE_PATH_MIG_FAILED: event.event = IB_EVENT_PATH_MIG_ERR; break; case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: event.event = IB_EVENT_QP_REQ_ERR; break; case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: event.event = IB_EVENT_QP_ACCESS_ERR; break; default: pr_warn("Unexpected event type %d " "on QP %06x\n", type, qp->qpn); return; } ibqp->event_handler(&event, ibqp->qp_context); } } static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) { /* * UD WQEs must have a datagram segment. * RC and UC WQEs might have a remote address segment. * MLX WQEs need two extra inline data segments (for the UD * header and space for the ICRC). */ switch (type) { case MLX4_IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0); case MLX4_IB_QPT_PROXY_SMI_OWNER: case MLX4_IB_QPT_PROXY_SMI: case MLX4_IB_QPT_PROXY_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + 64; case MLX4_IB_QPT_TUN_SMI_OWNER: case MLX4_IB_QPT_TUN_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg); case MLX4_IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_RC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_masked_atomic_seg) + sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + ALIGN(MLX4_IB_UD_HEADER_SIZE + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, MLX4_INLINE_ALIGN) * sizeof (struct mlx4_wqe_inline_seg), sizeof (struct mlx4_wqe_data_seg)) + ALIGN(4 + sizeof (struct mlx4_wqe_inline_seg), sizeof (struct mlx4_wqe_data_seg)); default: return sizeof (struct mlx4_wqe_ctrl_seg); } } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, int is_user, int has_rq, struct mlx4_ib_qp *qp) { /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) return -EINVAL; if (!has_rq) { if (cap->max_recv_wr) return -EINVAL; qp->rq.wqe_cnt = qp->rq.max_gs = 0; } else { /* HW requires >= 1 RQ entry with >= 1 gather entry */ if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) return -EINVAL; qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); } /* leave userspace return values as they were, so as not to break ABI */ if (is_user) { cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; cap->max_recv_sge = qp->rq.max_gs; } else { cap->max_recv_wr = qp->rq.max_post = min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); cap->max_recv_sge = min(qp->rq.max_gs, min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)); } return 0; } static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) { int s; /* Sanity check SQ size before proceeding */ if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || cap->max_inline_data + send_wqe_overhead(type, qp->flags) + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) return -EINVAL; /* * For MLX transport we need 2 extra S/G entries: * one for the header and one for the checksum at the end */ if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) return -EINVAL; s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + send_wqe_overhead(type, qp->flags); if (s > dev->dev->caps.max_sq_desc_sz) return -EINVAL; /* * Hermon supports shrinking WQEs, such that a single work * request can include multiple units of 1 << wqe_shift. This * way, work requests can differ in size, and do not have to * be a power of 2 in size, saving memory and speeding up send * WR posting. Unfortunately, if we do this then the * wqe_index field in CQEs can't be used to look up the WR ID * anymore, so we do this only if selective signaling is off. * * Further, on 32-bit platforms, we can't use vmap() to make * the QP buffer virtually contiguous. Thus we have to use * constant-sized WRs to make sure a WR is always fully within * a single page-sized chunk. * * Finally, we use NOP work requests to pad the end of the * work queue, to avoid wrap-around in the middle of WR. We * set NEC bit to avoid getting completions with error for * these NOP WRs, but since NEC is only supported starting * with firmware 2.2.232, we use constant-sized WRs for older * firmware. * * And, since MLX QPs only support SEND, we use constant-sized * WRs in this case. * * We look for the smallest value of wqe_shift such that the * resulting number of wqes does not exceed device * capabilities. * * We set WQE size to at least 64 bytes, this way stamping * invalidates each WQE. */ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && qp->sq_signal_bits && BITS_PER_LONG == 64 && type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) qp->sq.wqe_shift = ilog2(64); else qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); for (;;) { qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); /* * We need to leave 2 KB + 1 WR of headroom in the SQ to * allow HW to prefetch. */ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * qp->sq_max_wqes_per_wr + qp->sq_spare_wqes); if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) break; if (qp->sq_max_wqes_per_wr <= 1) return -EINVAL; ++qp->sq.wqe_shift; } qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - send_wqe_overhead(type, qp->flags)) / sizeof (struct mlx4_wqe_data_seg); qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << qp->sq.wqe_shift); if (qp->rq.wqe_shift > qp->sq.wqe_shift) { qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; } else { qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; qp->sq.offset = 0; } cap->max_send_wr = qp->sq.max_post = (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; cap->max_send_sge = min(qp->sq.max_gs, min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)); qp->max_inline_data = cap->max_inline_data; return 0; } static int set_user_sq_size(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, struct mlx4_ib_create_qp *ucmd) { /* Sanity check SQ size before proceeding */ if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || ucmd->log_sq_stride > ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) return -EINVAL; qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; qp->sq.wqe_shift = ucmd->log_sq_stride; qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << qp->sq.wqe_shift); return 0; } static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) { int i; qp->sqp_proxy_rcv = kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, GFP_KERNEL); if (!qp->sqp_proxy_rcv) return -ENOMEM; for (i = 0; i < qp->rq.wqe_cnt; i++) { qp->sqp_proxy_rcv[i].addr = kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), GFP_KERNEL); if (!qp->sqp_proxy_rcv[i].addr) goto err; qp->sqp_proxy_rcv[i].map = ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map))) { pr_warn("ib_dma_map_single failed\n"); kfree(qp->sqp_proxy_rcv[i].addr); goto err; } } return 0; err: while (i > 0) { --i; ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); kfree(qp->sqp_proxy_rcv[i].addr); } kfree(qp->sqp_proxy_rcv); qp->sqp_proxy_rcv = NULL; return -ENOMEM; } static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) { int i; for (i = 0; i < qp->rq.wqe_cnt; i++) { ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); kfree(qp->sqp_proxy_rcv[i].addr); } kfree(qp->sqp_proxy_rcv); } static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp, struct ib_qp_init_attr *attr, int *qpn) { struct mlx4_ib_qpg_data *qpg_data; int tss_num, rss_num; int tss_align_num, rss_align_num; int tss_base, rss_base = 0; int err; /* Parent is part of the TSS range (in SW TSS ARP is sent via parent) */ tss_num = 1 + attr->parent_attrib.tss_child_count; tss_align_num = roundup_pow_of_two(tss_num); rss_num = attr->parent_attrib.rss_child_count; rss_align_num = roundup_pow_of_two(rss_num); if (rss_num > 1) { /* RSS is requested */ if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS)) return -ENOSYS; if (rss_align_num > dev->dev->caps.max_rss_tbl_sz) return -EINVAL; /* We must work with power of two */ attr->parent_attrib.rss_child_count = rss_align_num; } qpg_data = kzalloc(sizeof *qpg_data, GFP_KERNEL); if (!qpg_data) return -ENOMEM; if(pqp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base); else err = mlx4_qp_reserve_range(dev->dev, tss_align_num, tss_align_num, &tss_base, MLX4_RESERVE_BF_QP); if (err) goto err1; if (tss_num > 1) { u32 alloc = BITS_TO_LONGS(tss_align_num) * sizeof(long); qpg_data->tss_bitmap = kzalloc(alloc, GFP_KERNEL); if (qpg_data->tss_bitmap == NULL) { err = -ENOMEM; goto err2; } bitmap_fill(qpg_data->tss_bitmap, tss_num); /* Note parent takes first index */ clear_bit(0, qpg_data->tss_bitmap); } if (rss_num > 1) { u32 alloc = BITS_TO_LONGS(rss_align_num) * sizeof(long); err = mlx4_qp_reserve_range(dev->dev, rss_align_num, 1, &rss_base, 0); if (err) goto err3; qpg_data->rss_bitmap = kzalloc(alloc, GFP_KERNEL); if (qpg_data->rss_bitmap == NULL) { err = -ENOMEM; goto err4; } bitmap_fill(qpg_data->rss_bitmap, rss_align_num); } qpg_data->tss_child_count = attr->parent_attrib.tss_child_count; qpg_data->rss_child_count = attr->parent_attrib.rss_child_count; qpg_data->qpg_parent = pqp; qpg_data->qpg_tss_mask_sz = ilog2(tss_align_num); qpg_data->tss_qpn_base = tss_base; qpg_data->rss_qpn_base = rss_base; pqp->qpg_data = qpg_data; *qpn = tss_base; return 0; err4: mlx4_qp_release_range(dev->dev, rss_base, rss_align_num); err3: if (tss_num > 1) kfree(qpg_data->tss_bitmap); err2: if(pqp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, tss_base, tss_align_num); else mlx4_qp_release_range(dev->dev, tss_base, tss_align_num); err1: kfree(qpg_data); return err; } static void free_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp) { struct mlx4_ib_qpg_data *qpg_data = pqp->qpg_data; int align_num; if (qpg_data->tss_child_count > 1) kfree(qpg_data->tss_bitmap); align_num = roundup_pow_of_two(1 + qpg_data->tss_child_count); if(pqp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qpg_data->tss_qpn_base, align_num); else mlx4_qp_release_range(dev->dev, qpg_data->tss_qpn_base, align_num); if (qpg_data->rss_child_count > 1) { kfree(qpg_data->rss_bitmap); align_num = roundup_pow_of_two(qpg_data->rss_child_count); mlx4_qp_release_range(dev->dev, qpg_data->rss_qpn_base, align_num); } kfree(qpg_data); } static int alloc_qpg_qpn(struct ib_qp_init_attr *init_attr, struct mlx4_ib_qp *pqp, int *qpn) { struct mlx4_ib_qp *mqp = to_mqp(init_attr->qpg_parent); struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data; u32 idx, old; switch (init_attr->qpg_type) { case IB_QPG_CHILD_TX: if (qpg_data->tss_child_count == 0) return -EINVAL; do { /* Parent took index 0 */ idx = find_first_bit(qpg_data->tss_bitmap, qpg_data->tss_child_count + 1); if (idx >= qpg_data->tss_child_count + 1) return -ENOMEM; old = test_and_clear_bit(idx, qpg_data->tss_bitmap); } while (old == 0); idx += qpg_data->tss_qpn_base; break; case IB_QPG_CHILD_RX: if (qpg_data->rss_child_count == 0) return -EINVAL; do { idx = find_first_bit(qpg_data->rss_bitmap, qpg_data->rss_child_count); if (idx >= qpg_data->rss_child_count) return -ENOMEM; old = test_and_clear_bit(idx, qpg_data->rss_bitmap); } while (old == 0); idx += qpg_data->rss_qpn_base; break; default: return -EINVAL; } pqp->qpg_data = qpg_data; *qpn = idx; return 0; } static void free_qpg_qpn(struct mlx4_ib_qp *mqp, int qpn) { struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data; switch (mqp->qpg_type) { case IB_QPG_CHILD_TX: /* Do range check */ qpn -= qpg_data->tss_qpn_base; set_bit(qpn, qpg_data->tss_bitmap); break; case IB_QPG_CHILD_RX: qpn -= qpg_data->rss_qpn_base; set_bit(qpn, qpg_data->rss_bitmap); break; default: /* error */ pr_warn("wrong qpg type (%d)\n", mqp->qpg_type); break; } } static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, struct ib_qp_init_attr *attr, int *qpn) { int err = 0; switch (attr->qpg_type) { case IB_QPG_NONE: /* Raw packet QPNs may not have bits 6,7 set in their qp_num; * otherwise, the WQE BlueFlame setup flow wrongly causes * VLAN insertion. */ if (attr->qp_type == IB_QPT_RAW_PACKET) { err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, MLX4_RESERVE_BF_QP); } else { if(qp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, 1, qpn); else err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 0); } break; case IB_QPG_PARENT: err = init_qpg_parent(dev, qp, attr, qpn); break; case IB_QPG_CHILD_TX: case IB_QPG_CHILD_RX: err = alloc_qpg_qpn(attr, qp, qpn); break; default: qp->qpg_type = IB_QPG_NONE; err = -EINVAL; break; } if (err) return err; qp->qpg_type = attr->qpg_type; return 0; } static void free_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, enum ib_qpg_type qpg_type, int qpn) { switch (qpg_type) { case IB_QPG_NONE: if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qpn, 1); else mlx4_qp_release_range(dev->dev, qpn, 1); break; case IB_QPG_PARENT: free_qpg_parent(dev, qp); break; case IB_QPG_CHILD_TX: case IB_QPG_CHILD_RX: free_qpg_qpn(qp, qpn); break; default: break; } } /* Revert allocation on create_qp_common */ static void unalloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, struct ib_qp_init_attr *attr, int qpn) { free_qpn_common(dev, qp, attr->qpg_type, qpn); } static void release_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { free_qpn_common(dev, qp, qp->qpg_type, qp->mqp.qpn); } static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) { int qpn; int err; struct mlx4_ib_sqp *sqp; struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { if (mlx4_is_mfunc(dev->dev) && (!mlx4_is_master(dev->dev) || !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { if (init_attr->qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_PROXY_GSI; else if (mlx4_is_master(dev->dev)) qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; else qp_type = MLX4_IB_QPT_PROXY_SMI; } qpn = sqpn; /* add extra sg entry for tunneling */ init_attr->cap.max_recv_sge++; } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { struct mlx4_ib_qp_tunnel_init_attr *tnl_init = container_of(init_attr, struct mlx4_ib_qp_tunnel_init_attr, init_attr); if ((tnl_init->proxy_qp_type != IB_QPT_SMI && tnl_init->proxy_qp_type != IB_QPT_GSI) || !mlx4_is_master(dev->dev)) return -EINVAL; if (tnl_init->proxy_qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_TUN_GSI; else if (tnl_init->slave == mlx4_master_func_num(dev->dev)) qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; else qp_type = MLX4_IB_QPT_TUN_SMI; /* we are definitely in the PPF here, since we are creating * tunnel QPs. base_tunnel_sqpn is therefore valid. */ qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; sqpn = qpn; } if (!*caller_qp) { if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL); if (!sqp) return -ENOMEM; qp = &sqp->qp; qp->pri.vid = qp->alt.vid = 0xFFFF; } else { qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL); if (!qp) return -ENOMEM; qp->pri.vid = qp->alt.vid = 0xFFFF; } } else qp = *caller_qp; qp->mlx4_ib_qp_type = qp_type; if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { if (dev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && !mlx4_is_mfunc(dev->dev)) qp->flags |= MLX4_IB_QP_NETIF; else { err = -EINVAL; goto err; } } mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); INIT_LIST_HEAD(&qp->gid_list); INIT_LIST_HEAD(&qp->steering_rules); INIT_LIST_HEAD(&qp->rules_list); qp->state = IB_QPS_RESET; if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, mlx4_ib_qp_has_rq(init_attr), qp); if (err) goto err; if (pd->uobject) { struct mlx4_ib_create_qp ucmd; int shift; int n; if (!udata || ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { err = -EFAULT; goto err; } if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) qp->flags |= MLX4_IB_QP_CAP_CROSS_CHANNEL; if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) qp->flags |= MLX4_IB_QP_CAP_MANAGED_SEND; if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) qp->flags |= MLX4_IB_QP_CAP_MANAGED_RECV; qp->sq_no_prefetch = ucmd.sq_no_prefetch; err = set_user_sq_size(dev, qp, &ucmd); if (err) goto err; qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; } n = ib_umem_page_count(qp->umem); shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); if (err) goto err_buf; err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); if (err) goto err_mtt; if (mlx4_ib_qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &qp->db); if (err) goto err_mtt; } } else { qp->sq_no_prefetch = 0; err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; if (mlx4_ib_qp_has_rq(init_attr)) { err = mlx4_db_alloc(dev->dev, &qp->db, 0); if (err) goto err; *qp->db.db = 0; } if (qp->max_inline_data) { err = mlx4_bf_alloc(dev->dev, &qp->bf, 0); if (err) { pr_debug("failed to allocate blue flame" " register (%d)", err); qp->bf.uar = &dev->priv_uar; } } else qp->bf.uar = &dev->priv_uar; if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { err = -ENOMEM; goto err_db; } err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, &qp->mtt); if (err) goto err_buf; err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); if (err) goto err_mtt; qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; } } if (sqpn) { if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { if (alloc_proxy_bufs(pd->device, qp)) { err = -ENOMEM; goto err_wrid; } } } else { err = alloc_qpn_common(dev, qp, init_attr, &qpn); if (err) goto err_proxy; } err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); if (err) goto err_qpn; if (init_attr->qp_type == IB_QPT_XRC_TGT) qp->mqp.qpn |= (1 << 23); /* * Hardware wants QPN written in big-endian order (after * shifting) for send doorbell. Precompute this value to save * a little bit when posting sends. */ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); qp->mqp.event = mlx4_ib_qp_event; if (!*caller_qp) *caller_qp = qp; return 0; err_qpn: unalloc_qpn_common(dev, qp, init_attr, qpn); err_proxy: if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) free_proxy_bufs(pd->device, qp); err_wrid: if (pd->uobject) { if (mlx4_ib_qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { kfree(qp->sq.wrid); kfree(qp->rq.wrid); } err_mtt: mlx4_mtt_cleanup(dev->dev, &qp->mtt); err_buf: if (pd->uobject) ib_umem_release(qp->umem); else mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); err_db: if (!pd->uobject && mlx4_ib_qp_has_rq(init_attr)) mlx4_db_free(dev->dev, &qp->db); if (qp->max_inline_data) mlx4_bf_free(dev->dev, &qp->bf); err: if (!*caller_qp) kfree(qp); return err; } static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) { switch (state) { case IB_QPS_RESET: return MLX4_QP_STATE_RST; case IB_QPS_INIT: return MLX4_QP_STATE_INIT; case IB_QPS_RTR: return MLX4_QP_STATE_RTR; case IB_QPS_RTS: return MLX4_QP_STATE_RTS; case IB_QPS_SQD: return MLX4_QP_STATE_SQD; case IB_QPS_SQE: return MLX4_QP_STATE_SQER; case IB_QPS_ERR: return MLX4_QP_STATE_ERR; default: return -1; } } static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { spin_lock_irq(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) __releases(&send_cq->lock) __releases(&recv_cq->lock) { if (send_cq == recv_cq) { __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else { spin_unlock(&send_cq->lock); spin_unlock_irq(&recv_cq->lock); } } static void del_gid_entries(struct mlx4_ib_qp *qp) { struct mlx4_ib_gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { list_del(&ge->list); kfree(ge); } } static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) { if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); else return to_mpd(qp->ibqp.pd); } static void get_cqs(struct mlx4_ib_qp *qp, struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) { switch (qp->ibqp.qp_type) { case IB_QPT_XRC_TGT: *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); *recv_cq = *send_cq; break; case IB_QPT_XRC_INI: *send_cq = to_mcq(qp->ibqp.send_cq); *recv_cq = *send_cq; break; default: *send_cq = to_mcq(qp->ibqp.send_cq); *recv_cq = to_mcq(qp->ibqp.recv_cq); break; } } static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); if (qp->pri.smac) { mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = 0; } if (qp->alt.smac) { mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); qp->alt.smac = 0; } if (qp->pri.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); qp->pri.vid = 0xFFFF; qp->pri.candidate_vid = 0xFFFF; qp->pri.update_vid = 0; } if (qp->alt.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); qp->alt.vid = 0xFFFF; qp->alt.candidate_vid = 0xFFFF; qp->alt.update_vid = 0; } } get_cqs(qp, &send_cq, &recv_cq); mlx4_ib_lock_cqs(send_cq, recv_cq); if (!is_user) { __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL); if (send_cq != recv_cq) __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); } mlx4_qp_remove(dev->dev, &qp->mqp); mlx4_ib_unlock_cqs(send_cq, recv_cq); mlx4_qp_free(dev->dev, &qp->mqp); if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) release_qpn_common(dev, qp); mlx4_mtt_cleanup(dev->dev, &qp->mtt); if (is_user) { if (qp->rq.wqe_cnt) mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), &qp->db); ib_umem_release(qp->umem); } else { kfree(qp->sq.wrid); kfree(qp->rq.wrid); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(&dev->ib_dev, qp); mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); if (qp->max_inline_data) mlx4_bf_free(dev->dev, &qp->bf); if (qp->rq.wqe_cnt) mlx4_db_free(dev->dev, &qp->db); } del_gid_entries(qp); } static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) { /* Native or PPF */ if (!mlx4_is_mfunc(dev->dev) || (mlx4_is_master(dev->dev) && attr->create_flags & MLX4_IB_SRIOV_SQP)) { return dev->dev->phys_caps.base_sqpn + (attr->qp_type == IB_QPT_SMI ? 0 : 2) + attr->port_num - 1; } /* PF or VF -- creating proxies */ if (attr->qp_type == IB_QPT_SMI) return dev->dev->caps.qp0_proxy[attr->port_num - 1]; else return dev->dev->caps.qp1_proxy[attr->port_num - 1]; } static int check_qpg_attr(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) { if (attr->qpg_type == IB_QPG_NONE) return 0; if (attr->qp_type != IB_QPT_UD && attr->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; if (attr->qpg_type == IB_QPG_PARENT) { if (attr->parent_attrib.tss_child_count == 1) return -EINVAL; /* Doesn't make sense */ if (attr->parent_attrib.rss_child_count == 1) return -EINVAL; /* Doesn't make sense */ if ((attr->parent_attrib.tss_child_count == 0) && (attr->parent_attrib.rss_child_count == 0)) /* Should be called with IP_QPG_NONE */ return -EINVAL; if (attr->parent_attrib.rss_child_count > 1) { int rss_align_num; if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS)) return -ENOSYS; rss_align_num = roundup_pow_of_two( attr->parent_attrib.rss_child_count); if (rss_align_num > dev->dev->caps.max_rss_tbl_sz) return -EINVAL; } } else { struct mlx4_ib_qpg_data *qpg_data; if (attr->qpg_parent == NULL) return -EINVAL; if (IS_ERR(attr->qpg_parent)) return -EINVAL; qpg_data = to_mqp(attr->qpg_parent)->qpg_data; if (qpg_data == NULL) return -EINVAL; if (attr->qpg_type == IB_QPG_CHILD_TX && !qpg_data->tss_child_count) return -EINVAL; if (attr->qpg_type == IB_QPG_CHILD_RX && !qpg_data->rss_child_count) return -EINVAL; } return 0; } #define RESERVED_FLAGS_MASK ((((unsigned int)IB_QP_CREATE_RESERVED_END - 1) | IB_QP_CREATE_RESERVED_END) \ & ~(IB_QP_CREATE_RESERVED_START - 1)) static enum mlx4_ib_qp_flags to_mlx4_ib_qp_flags(enum ib_qp_create_flags ib_qp_flags) { enum mlx4_ib_qp_flags mlx4_ib_qp_flags = 0; if (ib_qp_flags & IB_QP_CREATE_IPOIB_UD_LSO) mlx4_ib_qp_flags |= MLX4_IB_QP_LSO; if (ib_qp_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) mlx4_ib_qp_flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; if (ib_qp_flags & IB_QP_CREATE_NETIF_QP) mlx4_ib_qp_flags |= MLX4_IB_QP_NETIF; if (ib_qp_flags & IB_QP_CREATE_CROSS_CHANNEL) mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_CROSS_CHANNEL; if (ib_qp_flags & IB_QP_CREATE_MANAGED_SEND) mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_SEND; if (ib_qp_flags & IB_QP_CREATE_MANAGED_RECV) mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_RECV; /* reserved flags */ mlx4_ib_qp_flags |= (ib_qp_flags & RESERVED_FLAGS_MASK); return mlx4_ib_qp_flags; } struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { struct mlx4_ib_qp *qp = NULL; int err; u16 xrcdn = 0; enum mlx4_ib_qp_flags mlx4_qp_flags = to_mlx4_ib_qp_flags(init_attr->create_flags); struct ib_device *device; /* see ib_core::ib_create_qp same handling */ device = pd ? pd->device : init_attr->xrcd->device; /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. */ if (mlx4_qp_flags & ~(MLX4_IB_QP_LSO | MLX4_IB_QP_CAP_CROSS_CHANNEL | MLX4_IB_QP_CAP_MANAGED_SEND | MLX4_IB_QP_CAP_MANAGED_RECV | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | MLX4_IB_QP_NETIF)) return ERR_PTR(-EINVAL); if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { if (init_attr->qp_type != IB_QPT_UD) return ERR_PTR(-EINVAL); } if ((mlx4_qp_flags & (MLX4_IB_QP_CAP_CROSS_CHANNEL | MLX4_IB_QP_CAP_MANAGED_SEND | MLX4_IB_QP_CAP_MANAGED_RECV)) && !(to_mdev(device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL)) { pr_debug("%s Does not support cross-channel operations\n", to_mdev(device)->ib_dev.name); return ERR_PTR(-EINVAL); } if ((init_attr->create_flags & ~(IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV)) && (((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) && init_attr->qp_type != IB_QPT_UD) || ((mlx4_qp_flags & MLX4_IB_SRIOV_SQP) && init_attr->qp_type > IB_QPT_GSI))) return ERR_PTR(-EINVAL); err = check_qpg_attr(to_mdev(device), init_attr); if (err) return ERR_PTR(err); switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: pd = to_mxrcd(init_attr->xrcd)->pd; xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; /* fall through */ case IB_QPT_XRC_INI: if (!(to_mdev(device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); init_attr->recv_cq = init_attr->send_cq; /* fall through */ case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_RAW_PACKET: qp = kzalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); qp->pri.vid = qp->alt.vid = 0xFFFF; /* fall through */ case IB_QPT_UD: { err = create_qp_common(to_mdev(device), pd, init_attr, udata, 0, &qp); if (err) { kfree(qp); return ERR_PTR(err); } qp->ibqp.qp_num = qp->mqp.qpn; qp->xrcdn = xrcdn; break; } case IB_QPT_SMI: case IB_QPT_GSI: { /* Userspace is not allowed to create special QPs: */ if (udata) return ERR_PTR(-EINVAL); err = create_qp_common(to_mdev(device), pd, init_attr, udata, get_sqp_num(to_mdev(device), init_attr), &qp); if (err) return ERR_PTR(err); qp->port = init_attr->port_num; qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; break; } default: /* Don't support raw QPs */ return ERR_PTR(-EINVAL); } return &qp->ibqp; } int mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); struct mlx4_ib_pd *pd; if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); pd = get_pd(mqp); destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); else kfree(mqp); return 0; } static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) { switch (type) { case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; case MLX4_IB_QPT_XRC_INI: case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; case MLX4_IB_QPT_PROXY_SMI_OWNER: case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? MLX4_QP_ST_MLX : -1); case MLX4_IB_QPT_PROXY_SMI: case MLX4_IB_QPT_TUN_SMI: case MLX4_IB_QPT_PROXY_GSI: case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? MLX4_QP_ST_UD : -1); default: return -1; } } static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr, int attr_mask) { u8 dest_rd_atomic; u32 access_flags; u32 hw_access_flags = 0; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else dest_rd_atomic = qp->resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else access_flags = qp->atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) hw_access_flags |= MLX4_QP_BIT_RRE; if (access_flags & IB_ACCESS_REMOTE_ATOMIC) hw_access_flags |= MLX4_QP_BIT_RAE; if (access_flags & IB_ACCESS_REMOTE_WRITE) hw_access_flags |= MLX4_QP_BIT_RWE; return cpu_to_be32(hw_access_flags); } static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr, int attr_mask) { if (attr_mask & IB_QP_PKEY_INDEX) sqp->pkey_index = attr->pkey_index; if (attr_mask & IB_QP_QKEY) sqp->qkey = attr->qkey; if (attr_mask & IB_QP_SQ_PSN) sqp->send_psn = attr->sq_psn; } static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) { path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); } static int ib_rate_to_mlx4(struct mlx4_ib_dev *dev, u8 rate) { if (rate == IB_RATE_PORT_CURRENT) { return 0; } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { return -EINVAL; } else { while (rate != IB_RATE_2_5_GBPS && !(1 << (rate + MLX4_STAT_RATE_OFFSET) & dev->dev->caps.stat_rate_support)) --rate; } return rate + MLX4_STAT_RATE_OFFSET; } static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, u8 *smac, u16 vlan_id, struct mlx4_ib_qp *qp, struct mlx4_qp_path *path, u8 port, int is_primary) { int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; u16 vlan_tag; int vidx; int smac_index; int err; u64 u64_mac; struct mlx4_roce_smac_vlan_info *smac_info; path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); err = ib_rate_to_mlx4(dev, ah->static_rate); if (err < 0) return err; path->static_rate = err; if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { pr_err("sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1); return -1; } path->grh_mylmc |= 1 << 7; path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; path->tclass_flowlabel = cpu_to_be32((ah->grh.traffic_class << 20) | (ah->grh.flow_label)); memcpy(path->rgid, ah->grh.dgid.raw, 16); } if (is_eth) { if (!(ah->ah_flags & IB_AH_GRH)) return -1; path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 7) << 3); if (is_primary) smac_info = &qp->pri; else smac_info = &qp->alt; vlan_tag = vlan_id; if (vlan_tag < 0x1000) { if (smac_info->vid < 0x1000) { /* both valid vlan ids */ if (smac_info->vid != vlan_tag) { /* different VIDs. unreg old and reg new */ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); if (err) return err; smac_info->candidate_vid = vlan_tag; smac_info->candidate_vlan_index = vidx; smac_info->candidate_vlan_port = port; smac_info->update_vid = 1; path->vlan_index = vidx; path->fl = 1 << 6; } else { path->vlan_index = smac_info->vlan_index; path->fl = 1 << 6; } } else { /* no current vlan tag in qp */ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); if (err) return err; smac_info->candidate_vid = vlan_tag; smac_info->candidate_vlan_index = vidx; smac_info->candidate_vlan_port = port; smac_info->update_vid = 1; path->vlan_index = vidx; path->fl = 1 << 6; } } else { /* have current vlan tag. unregister it at modify-qp success */ if (smac_info->vid < 0x1000) { smac_info->candidate_vid = 0xFFFF; smac_info->update_vid = 1; } } /* get smac_index for RoCE use. * If no smac was yet assigned, register one. * If one was already assigned, but the new mac differs, * unregister the old one and register the new one. */ - u64_mac = mlx4_mac_to_u64(smac); + u64_mac = mlx4_mac_to_u64(smac); if (!smac_info->smac || smac_info->smac != u64_mac) { /* register candidate now, unreg if needed, after success */ smac_index = mlx4_register_mac(dev->dev, port, u64_mac); if (smac_index >= 0) { smac_info->candidate_smac_index = smac_index; smac_info->candidate_smac = u64_mac; smac_info->candidate_smac_port = port; } else return -EINVAL; } else smac_index = smac_info->smac_index; memcpy(path->dmac, ah->dmac, 6); path->ackto = MLX4_IB_LINK_TYPE_ETH; /* put MAC table smac index for IBoE */ path->grh_mylmc = (u8) (smac_index) | 0x80 ; } else path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); return 0; } static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { struct mlx4_ib_gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { ge->added = 1; ge->port = qp->port; } } } static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, const u8 *smac, struct mlx4_qp_context *context) { struct net_device *ndev; u64 u64_mac; int smac_index; ndev = dev->iboe.netdevs[qp->port - 1]; if (ndev) { smac = IF_LLADDR(ndev); u64_mac = mlx4_mac_to_u64(smac); } else { u64_mac = dev->dev->caps.def_mac[qp->port]; } context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); if (!qp->pri.smac) { smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); if (smac_index >= 0) { qp->pri.candidate_smac_index = smac_index; qp->pri.candidate_smac = u64_mac; qp->pri.candidate_smac_port = qp->port; context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; } else return -ENOENT; } return 0; } static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); struct mlx4_ib_pd *pd; struct mlx4_ib_cq *send_cq, *recv_cq; struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; int steer_qp = 0; int err = -EINVAL; int is_eth = -1; context = kzalloc(sizeof *context, GFP_KERNEL); if (!context) return -ENOMEM; context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); else { optpar |= MLX4_QP_OPTPAR_PM_STATE; switch (attr->path_mig_state) { case IB_MIG_MIGRATED: context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); break; case IB_MIG_REARM: context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11); break; case IB_MIG_ARMED: context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11); break; } } if (qp->max_inlr_data) context->param3 |= cpu_to_be32(1 << 25); if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; else if (ibqp->qp_type == IB_QPT_RAW_PACKET) context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; else if (ibqp->qp_type == IB_QPT_UD) { if (qp->flags & MLX4_IB_QP_LSO) context->mtu_msgmax = (IB_MTU_4096 << 5) | ilog2(dev->dev->caps.max_gso_sz); else context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { pr_err("path MTU (%u) is invalid\n", attr->path_mtu); goto out; } context->mtu_msgmax = (attr->path_mtu << 5) | ilog2(dev->dev->caps.max_msg_sz); } if (qp->rq.wqe_cnt) context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; context->rq_size_stride |= qp->rq.wqe_shift - 4; if (qp->sq.wqe_cnt) context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; context->sq_size_stride |= qp->sq.wqe_shift - 4; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; context->xrcd = cpu_to_be32((u32) qp->xrcdn); context->param3 |= cpu_to_be32(1 << 30); } if (qp->ibqp.uobject) context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index); else context->usr_page = cpu_to_be32(qp->bf.uar->index); if (attr_mask & IB_QP_DEST_QPN) context->remote_qpn = cpu_to_be32(attr->dest_qp_num); if (attr_mask & IB_QP_PORT) { if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD && !(attr_mask & IB_QP_AV)) { mlx4_set_sched(&context->pri_path, attr->port_num); optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE; } } if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { if (dev->counters[qp->port - 1].counter_index != -1) { context->pri_path.counter_index = dev->counters[qp->port - 1].counter_index; optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; } else { context->pri_path.counter_index = 0xff; } if (qp->flags & MLX4_IB_QP_NETIF && (qp->qpg_type == IB_QPG_NONE || qp->qpg_type == IB_QPG_PARENT)) { mlx4_ib_steer_qp_reg(dev, qp, 1); steer_qp = 1; } } if (attr_mask & IB_QP_PKEY_INDEX) { if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) context->pri_path.disable_pkey_check = 0x40; context->pri_path.pkey_index = attr->pkey_index; optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; } if ((attr_mask & IB_QP_AV) && (ibqp->qp_type != IB_QPT_RAW_PACKET)) { if (mlx4_set_path(dev, &attr->ah_attr, (u8 *)attr->smac, attr_mask & IB_QP_VID ? attr->vlan_id : 0xffff , qp, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port, 1)) goto out; optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE); } if (attr_mask & IB_QP_TIMEOUT) { context->pri_path.ackto |= attr->timeout << 3; optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; } if (attr_mask & IB_QP_ALT_PATH) { if (attr->alt_port_num == 0 || attr->alt_port_num > dev->dev->caps.num_ports) goto out; if (attr->alt_pkey_index >= dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; if (mlx4_set_path(dev, &attr->alt_ah_attr, (u8 *)attr->smac, attr_mask & IB_QP_ALT_VID ? attr->alt_vlan_id : 0xffff, qp, &context->alt_path, attr->alt_port_num, 0)) goto out; context->alt_path.pkey_index = attr->alt_pkey_index; context->alt_path.ackto = attr->alt_timeout << 3; context->alt_path.counter_index = dev->counters[attr->alt_port_num - 1].counter_index; optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } pd = get_pd(qp); get_cqs(qp, &send_cq, &recv_cq); context->pd = cpu_to_be32(pd->pdn); context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); /* Set "fast registration enabled" for all kernel QPs */ if (!qp->ibqp.uobject) context->params1 |= cpu_to_be32(1 << 11); if (attr_mask & IB_QP_RNR_RETRY) { context->params1 |= cpu_to_be32(attr->rnr_retry << 13); optpar |= MLX4_QP_OPTPAR_RNR_RETRY; } if (attr_mask & IB_QP_RETRY_CNT) { context->params1 |= cpu_to_be32(attr->retry_cnt << 16); optpar |= MLX4_QP_OPTPAR_RETRY_COUNT; } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic) context->params1 |= cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); optpar |= MLX4_QP_OPTPAR_SRA_MAX; } if (attr_mask & IB_QP_SQ_PSN) context->next_send_psn = cpu_to_be32(attr->sq_psn); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) context->params2 |= cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); optpar |= MLX4_QP_OPTPAR_RRA_MAX; } if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask); optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; } if (attr_mask & IB_M_EXT_CLASS_1) context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER); /* for now we enable also sqe on send */ if (attr_mask & IB_M_EXT_CLASS_2) { context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_SQ); context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER); } if (attr_mask & IB_M_EXT_CLASS_3) context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ); if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->params2 |= (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL ? cpu_to_be32(MLX4_QP_BIT_COLL_MASTER) : 0); context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_SEND ? cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_SQ) : 0); context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV ? cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_RQ) : 0); } if (ibqp->srq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); if (attr_mask & IB_QP_MIN_RNR_TIMER) { context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT; } if (attr_mask & IB_QP_RQ_PSN) context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ if (attr_mask & IB_QP_QKEY) { if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) context->qkey = cpu_to_be32(IB_QP_SET_QKEY); else { if (mlx4_is_mfunc(dev->dev) && !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && (attr->qkey & MLX4_RESERVED_QKEY_MASK) == MLX4_RESERVED_QKEY_BASE) { pr_err("Cannot use reserved QKEY" " 0x%x (range 0xffff0000..0xffffffff" " is reserved)\n", attr->qkey); err = -EINVAL; goto out; } context->qkey = cpu_to_be32(attr->qkey); } optpar |= MLX4_QP_OPTPAR_Q_KEY; } if (ibqp->srq) context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_PACKET)) { context->pri_path.sched_queue = (qp->port - 1) << 6; if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) context->pri_path.fl = 0x80; } else { if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; } if (ibqp->qp_type == IB_QPT_RAW_PACKET && (attr_mask & IB_QP_AV)) { context->pri_path.sched_queue |= ((attr->ah_attr.sl & 0xf) << 3); context->pri_path.feup = 1 << 6; } is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET; if (is_eth) { if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) context->pri_path.feup = 1 << 7; /* don't fsm */ /* handle smac_index */ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { err = handle_eth_ud_smac_index(dev, qp, (const u8 *)attr->smac, context); if (err) return -EINVAL; } } } if (ibqp->qp_type == IB_QPT_UD) if (is_eth && (new_state == IB_QPS_RTR)) { context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; } if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; else sqd_event = 0; if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->rlkey |= (1 << 4); if ((attr_mask & IB_QP_GROUP_RSS) && (qp->qpg_data->rss_child_count > 1)) { struct mlx4_ib_qpg_data *qpg_data = qp->qpg_data; void *rss_context_base = &context->pri_path; struct mlx4_rss_context *rss_context = (struct mlx4_rss_context *) (rss_context_base + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH); context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET); /* This should be tbl_sz_base_qpn */ rss_context->base_qpn = cpu_to_be32(qpg_data->rss_qpn_base | (ilog2(qpg_data->rss_child_count) << 24)); rss_context->default_qpn = cpu_to_be32(qpg_data->rss_qpn_base); /* This should be flags_hash_fn */ rss_context->flags = MLX4_RSS_TCP_IPV6 | MLX4_RSS_TCP_IPV4; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS) { rss_context->base_qpn_udp = rss_context->default_qpn; rss_context->flags |= MLX4_RSS_IPV6 | MLX4_RSS_IPV4 | MLX4_RSS_UDP_IPV6 | MLX4_RSS_UDP_IPV4; } if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP) { static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC, 0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD, 0x593D56D9, 0xF3253C06, 0x2ADC1FFC}; rss_context->hash_fn = MLX4_RSS_HASH_TOP; memcpy(rss_context->rss_key, rsskey, sizeof(rss_context->rss_key)); } else { rss_context->hash_fn = MLX4_RSS_HASH_XOR; memset(rss_context->rss_key, 0, sizeof(rss_context->rss_key)); } } /* * Before passing a kernel QP to the HW, make sure that the * ownership bits of the send queue are set and the SQ * headroom is stamped so that the hardware doesn't start * processing stale work requests. */ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { struct mlx4_wqe_ctrl_seg *ctrl; int i; for (i = 0; i < qp->sq.wqe_cnt; ++i) { ctrl = get_send_wqe(qp, i); ctrl->owner_opcode = cpu_to_be32(1 << 31); if (qp->sq_max_wqes_per_wr == 1) ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); } } if ((qp->port && rdma_port_get_link_layer(&dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET) && (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)) context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | MLX4_IB_LINK_TYPE_ETH; err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), to_mlx4_state(new_state), context, optpar, sqd_event, &qp->mqp); if (err) goto out; qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) qp->atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) { qp->port = attr->port_num; update_mcg_macs(dev, qp); } if (attr_mask & IB_QP_ALT_PATH) qp->alt_port = attr->alt_port_num; if (is_sqp(dev, qp)) store_sqp_attrs(to_msqp(qp), attr, attr_mask); /* Set 'ignore_cq_overrun' bits for collectives offload */ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { if (attr_mask & (IB_M_EXT_CLASS_2 | IB_M_EXT_CLASS_3)) { err = mlx4_ib_ignore_overrun_cq(ibqp->send_cq); if (err) { pr_err("Failed to set ignore CQ " "overrun for QP 0x%x's send CQ\n", ibqp->qp_num); goto out; } if (ibqp->recv_cq != ibqp->send_cq) { err = mlx4_ib_ignore_overrun_cq(ibqp->recv_cq); if (err) { pr_err("Failed to set ignore " "CQ overrun for QP 0x%x's recv " "CQ\n", ibqp->qp_num); goto out; } } } } /* * If we moved QP0 to RTR, bring the IB link up; if we moved * QP0 to RESET or ERROR, bring the link back down. */ if (is_qp0(dev, qp)) { if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) if (mlx4_INIT_PORT(dev->dev, qp->port)) pr_warn("INIT_PORT failed for port %d\n", qp->port); if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) mlx4_CLOSE_PORT(dev->dev, qp->port); } /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET) { if (!ibqp->uobject) { mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, ibqp->srq ? to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; qp->sq.head = 0; qp->sq.tail = 0; qp->sq_next_wqe = 0; if (qp->rq.wqe_cnt) *qp->db.db = 0; if (qp->flags & MLX4_IB_QP_NETIF && (qp->qpg_type == IB_QPG_NONE || qp->qpg_type == IB_QPG_PARENT)) mlx4_ib_steer_qp_reg(dev, qp, 0); } if (qp->pri.smac) { mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = 0; } if (qp->alt.smac) { mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); qp->alt.smac = 0; } if (qp->pri.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); qp->pri.vid = 0xFFFF; qp->pri.candidate_vid = 0xFFFF; qp->pri.update_vid = 0; } if (qp->alt.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); qp->alt.vid = 0xFFFF; qp->alt.candidate_vid = 0xFFFF; qp->alt.update_vid = 0; } } out: if (err && steer_qp) mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); if (qp->pri.candidate_smac) { if (err) mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); else { if (qp->pri.smac) { mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); } qp->pri.smac = qp->pri.candidate_smac; qp->pri.smac_index = qp->pri.candidate_smac_index; qp->pri.smac_port = qp->pri.candidate_smac_port; } qp->pri.candidate_smac = 0; qp->pri.candidate_smac_index = 0; qp->pri.candidate_smac_port = 0; } if (qp->alt.candidate_smac) { if (err) mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->pri.candidate_smac); else { if (qp->pri.smac) { mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); } qp->alt.smac = qp->alt.candidate_smac; qp->alt.smac_index = qp->alt.candidate_smac_index; qp->alt.smac_port = qp->alt.candidate_smac_port; } qp->pri.candidate_smac = 0; qp->pri.candidate_smac_index = 0; qp->pri.candidate_smac_port = 0; } if (qp->pri.update_vid) { if (err) { if (qp->pri.candidate_vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, qp->pri.candidate_vid); } else { if (qp->pri.vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); qp->pri.vid = qp->pri.candidate_vid; qp->pri.vlan_port = qp->pri.candidate_vlan_port; qp->pri.vlan_index = qp->pri.candidate_vlan_index; } qp->pri.candidate_vid = 0xFFFF; qp->pri.update_vid = 0; } if (qp->alt.update_vid) { if (err) { if (qp->alt.candidate_vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, qp->alt.candidate_vid); } else { if (qp->alt.vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); qp->alt.vid = qp->alt.candidate_vid; qp->alt.vlan_port = qp->alt.candidate_vlan_port; qp->alt.vlan_index = qp->alt.candidate_vlan_index; } qp->alt.candidate_vid = 0xFFFF; qp->alt.update_vid = 0; } return err; } int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; int ll; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (cur_state == new_state && cur_state == IB_QPS_RESET) { ll = IB_LINK_LAYER_UNSPECIFIED; } else { int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; ll = rdma_port_get_link_layer(&dev->ib_dev, port); } if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask & ~IB_M_QP_MOD_VEND_MASK, ll)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", ibqp->qp_num, cur_state, new_state, ibqp->qp_type, attr_mask); goto out; } if ((attr_mask & IB_M_QP_MOD_VEND_MASK) && !dev->dev->caps.sync_qp) { pr_err("extended verbs are not supported by %s\n", dev->ib_dev.name); goto out; } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->num_ports)) { pr_debug("qpn 0x%x: invalid port number (%d) specified " "for transition %d to %d. qp_type %d\n", ibqp->qp_num, attr->port_num, cur_state, new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_PKEY_INDEX) { int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { pr_debug("qpn 0x%x: invalid pkey index (%d) specified " "for transition %d to %d. qp_type %d\n", ibqp->qp_num, attr->pkey_index, cur_state, new_state, ibqp->qp_type); goto out; } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " "Transition %d to %d. qp_type %d\n", ibqp->qp_num, attr->max_rd_atomic, cur_state, new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " "Transition %d to %d. qp_type %d\n", ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, new_state, ibqp->qp_type); goto out; } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; goto out; } err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); out: mutex_unlock(&qp->mutex); return err; } static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); struct ib_device *ib_dev = &mdev->ib_dev; struct mlx4_wqe_mlx_seg *mlx = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); u16 pkey; u32 qkey; int send_size; int header_size; int spc; int i; if (wr->opcode != IB_WR_SEND) return -EINVAL; send_size = 0; for (i = 0; i < wr->num_sge; ++i) send_size += wr->sg_list[i].length; /* for proxy-qp0 sends, need to add in size of tunnel header */ /* for tunnel-qp0 sends, tunnel header is already in s/g list */ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) send_size += sizeof (struct mlx4_ib_tunnel_header); ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; sqp->ud_header.lrh.destination_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); } mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); /* force loopback */ mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); mlx->rlid = sqp->ud_header.lrh.destination_lid; sqp->ud_header.lrh.virtual_lane = 0; sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); else sqp->ud_header.bth.destination_qpn = cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) return -EINVAL; sqp->ud_header.deth.qkey = cpu_to_be32(qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; sqp->ud_header.immediate_present = 0; header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); /* * Inline data segments may not cross a 64 byte boundary. If * our UD header is bigger than the space available up to the * next 64 byte boundary in the WQE, use two inline data * segments to hold the UD header. */ spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (header_size <= spc) { inl->byte_count = cpu_to_be32(1 << 31 | header_size); memcpy(inl + 1, sqp->header_buf, header_size); i = 1; } else { inl->byte_count = cpu_to_be32(1 << 31 | spc); memcpy(inl + 1, sqp->header_buf, spc); inl = (void *) (inl + 1) + spc; memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); /* * Need a barrier here to make sure all the data is * visible before the byte_count field is set. * Otherwise the HCA prefetcher could grab the 64-byte * chunk with this inline segment and get a valid (!= * 0xffffffff) byte count but stale data, and end up * generating a packet with bad headers. * * The first inline segment's byte_count field doesn't * need a barrier, because it comes after a * control/MLX segment and therefore is at an offset * of 16 mod 64. */ wmb(); inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); i = 2; } *mlx_seg_len = ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); return 0; } static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct ib_device *ib_dev = sqp->qp.ibqp.device; struct mlx4_wqe_mlx_seg *mlx = wqe; struct mlx4_wqe_ctrl_seg *ctrl = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); union ib_gid sgid; u16 pkey; int send_size; int header_size; int spc; int i; int is_eth; int is_vlan = 0; int is_grh; u16 uninitialized_var(vlan); int err = 0; send_size = 0; for (i = 0; i < wr->num_sge; ++i) send_size += wr->sg_list[i].length; is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); if (is_eth) { if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so * we must use our own cache */ err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sgid.raw[0]); if (err) return err; } else { err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sgid); if (err) return err; } if (is_eth && ah->av.eth.vlan != 0xffff) { vlan = cpu_to_be16(ah->av.eth.vlan) & 0x0fff; is_vlan = 1; } } ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); if (!is_eth) { sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); } if (is_grh) { sqp->ud_header.grh.traffic_class = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; if (is_eth) memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); else { if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so * we must use our own cache */ sqp->ud_header.grh.source_gid.global.subnet_prefix = to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. subnet_prefix; sqp->ud_header.grh.source_gid.global.interface_id = to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. guid_cache[ah->av.ib.gid_index]; } else ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); } memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16); } mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); if (!is_eth) { mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | (sqp->ud_header.lrh.service_level << 8)); if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) mlx->flags |= cpu_to_be32(0x1); /* force loopback */ mlx->rlid = sqp->ud_header.lrh.destination_lid; } switch (wr->opcode) { case IB_WR_SEND: sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; sqp->ud_header.immediate_present = 0; break; case IB_WR_SEND_WITH_IMM: sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; sqp->ud_header.immediate_present = 1; sqp->ud_header.immediate_data = wr->ex.imm_data; break; default: return -EINVAL; } if (is_eth) { u8 *smac; struct in6_addr in6; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; mlx->sched_prio = cpu_to_be16(pcp); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); /* FIXME: cache smac value? */ memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); memcpy(&in6, sgid.raw, sizeof(in6)); if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]); else smac = ah->av.eth.s_mac; /* use the src mac of the tunnel */ memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); if (!is_vlan) { sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); } else { sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); } } else { sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; } sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); if (!sqp->qp.ibqp.qp_num) ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); else ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? sqp->qkey : wr->wr.ud.remote_qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); if (0) { pr_err("built UD header of size %d:\n", header_size); for (i = 0; i < header_size / 4; ++i) { if (i % 8 == 0) pr_err(" [%02x] ", i * 4); pr_cont(" %08x", be32_to_cpu(((__be32 *) sqp->header_buf)[i])); if ((i + 1) % 8 == 0) pr_cont("\n"); } pr_err("\n"); } /* * Inline data segments may not cross a 64 byte boundary. If * our UD header is bigger than the space available up to the * next 64 byte boundary in the WQE, use two inline data * segments to hold the UD header. */ spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (header_size <= spc) { inl->byte_count = cpu_to_be32(1 << 31 | header_size); memcpy(inl + 1, sqp->header_buf, header_size); i = 1; } else { inl->byte_count = cpu_to_be32(1 << 31 | spc); memcpy(inl + 1, sqp->header_buf, spc); inl = (void *) (inl + 1) + spc; memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); /* * Need a barrier here to make sure all the data is * visible before the byte_count field is set. * Otherwise the HCA prefetcher could grab the 64-byte * chunk with this inline segment and get a valid (!= * 0xffffffff) byte count but stale data, and end up * generating a packet with bad headers. * * The first inline segment's byte_count field doesn't * need a barrier, because it comes after a * control/MLX segment and therefore is at an offset * of 16 mod 64. */ wmb(); inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); i = 2; } *mlx_seg_len = ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); return 0; } static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) { unsigned cur; struct mlx4_ib_cq *cq; cur = wq->head - wq->tail; if (likely(cur + nreq < wq->max_post)) return 0; cq = to_mcq(ib_cq); spin_lock(&cq->lock); cur = wq->head - wq->tail; spin_unlock(&cq->lock); return cur + nreq >= wq->max_post; } static __be32 convert_access(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | - (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); } static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) { struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); int i; for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i) mfrpl->mapped_page_list[i] = cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] | MLX4_MTT_FLAG_PRESENT); fseg->flags = convert_access(wr->wr.fast_reg.access_flags); fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey); fseg->buf_list = cpu_to_be64(mfrpl->map); fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length); fseg->offset = 0; /* XXX -- is this just for ZBVA? */ fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift); fseg->reserved[0] = 0; fseg->reserved[1] = 0; } static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr) { bseg->flags1 = convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) & cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); bseg->flags2 = 0; if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2) bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED) bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey); bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey); bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr); bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length); } static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) { iseg->mem_key = cpu_to_be32(rkey); iseg->reserved1 = 0; iseg->reserved2 = 0; iseg->reserved3[0] = 0; iseg->reserved3[1] = 0; } static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, u64 remote_addr, u32 rkey) { rseg->raddr = cpu_to_be64(remote_addr); rseg->rkey = cpu_to_be32(rkey); rseg->reserved = 0; } static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr) { if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask); } else { aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); aseg->compare = 0; } } static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, struct ib_send_wr *wr) { aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); aseg->swap_add_mask = cpu_to_be64(wr->wr.atomic.swap_mask); aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); aseg->compare_mask = cpu_to_be64(wr->wr.atomic.compare_add_mask); } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, struct ib_send_wr *wr) { memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); } static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, struct mlx4_wqe_datagram_seg *dseg, struct ib_send_wr *wr, enum ib_qp_type qpt) { union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; struct mlx4_av sqp_av = {0}; int port = *((u8 *) &av->ib.port_pd) & 0x3; /* force loopback */ sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & cpu_to_be32(0xf0000000); memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); /* This function used only for sending on QP1 proxies */ dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); /* Use QKEY from the QP context, which is set by master */ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); } static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct mlx4_wqe_inline_seg *inl = wqe; struct mlx4_ib_tunnel_header hdr; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); int spc; int i; memcpy(&hdr.av, &ah->av, sizeof hdr.av); hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); memcpy(hdr.mac, ah->av.eth.mac, 6); hdr.vlan = cpu_to_be16(ah->av.eth.vlan); spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (sizeof (hdr) <= spc) { memcpy(inl + 1, &hdr, sizeof (hdr)); wmb(); inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); i = 1; } else { memcpy(inl + 1, &hdr, spc); wmb(); inl->byte_count = cpu_to_be32(1 << 31 | spc); inl = (void *) (inl + 1) + spc; memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); wmb(); inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); i = 2; } *mlx_seg_len = ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16); } static void set_mlx_icrc_seg(void *dseg) { u32 *t = dseg; struct mlx4_wqe_inline_seg *iseg = dseg; t[1] = 0; /* * Need a barrier here before writing the byte_count field to * make sure that all the data is visible before the * byte_count field is set. Otherwise, if the segment begins * a new cacheline, the HCA prefetcher could grab the 64-byte * chunk and get a valid (!= * 0xffffffff) byte count but * stale data, and end up sending the wrong data. */ wmb(); iseg->byte_count = cpu_to_be32((1 << 31) | 4); } static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) { dseg->lkey = cpu_to_be32(sg->lkey); dseg->addr = cpu_to_be64(sg->addr); /* * Need a barrier here before writing the byte_count field to * make sure that all the data is visible before the * byte_count field is set. Otherwise, if the segment begins * a new cacheline, the HCA prefetcher could grab the 64-byte * chunk and get a valid (!= * 0xffffffff) byte count but * stale data, and end up sending the wrong data. */ wmb(); dseg->byte_count = cpu_to_be32(sg->length); } static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) { dseg->byte_count = cpu_to_be32(sg->length); dseg->lkey = cpu_to_be32(sg->lkey); dseg->addr = cpu_to_be64(sg->addr); } static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) *blh = cpu_to_be32(1 << 6); if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && wr->num_sge > qp->sq.max_gs - (halign >> 4))) return -EINVAL; memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | wr->wr.ud.hlen); *lso_seg_len = halign; return 0; } static __be32 send_ieth(struct ib_send_wr *wr) { switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: case IB_WR_RDMA_WRITE_WITH_IMM: return wr->ex.imm_data; case IB_WR_SEND_WITH_INV: return cpu_to_be32(wr->ex.invalidate_rkey); default: return 0; } } static void add_zero_len_inline(void *wqe) { struct mlx4_wqe_inline_seg *inl = wqe; memset(wqe, 0, 16); inl->byte_count = cpu_to_be32(1 << 31); } static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr, void *wqe, int *sz) { struct mlx4_wqe_inline_seg *seg; void *addr; int len, seg_len; int num_seg; int off, to_copy; int i; int inl = 0; seg = wqe; wqe += sizeof *seg; off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1); num_seg = 0; seg_len = 0; for (i = 0; i < wr->num_sge; ++i) { addr = (void *) (unsigned long)(wr->sg_list[i].addr); len = wr->sg_list[i].length; inl += len; if (inl > qp->max_inline_data) { inl = 0; return -1; } while (len >= MLX4_INLINE_ALIGN - off) { to_copy = MLX4_INLINE_ALIGN - off; memcpy(wqe, addr, to_copy); len -= to_copy; wqe += to_copy; addr += to_copy; seg_len += to_copy; wmb(); /* see comment below */ seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); seg_len = 0; seg = wqe; wqe += sizeof *seg; off = sizeof *seg; ++num_seg; } memcpy(wqe, addr, len); wqe += len; seg_len += len; off += len; } if (seg_len) { ++num_seg; /* * Need a barrier here to make sure * all the data is visible before the * byte_count field is set. Otherwise * the HCA prefetcher could grab the * 64-byte chunk with this inline * segment and get a valid (!= * 0xffffffff) byte count but stale * data, and end up sending the wrong * data. */ wmb(); seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); } *sz = (inl + num_seg * sizeof *seg + 15) / 16; return 0; } /* * Avoid using memcpy() to copy to BlueFlame page, since memcpy() * implementations may use move-string-buffer assembler instructions, * which do not guarantee order of copying. */ static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) { __iowrite64_copy(dst, src, bytecnt / 8); } int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { struct mlx4_ib_qp *qp = to_mqp(ibqp); void *wqe; struct mlx4_wqe_ctrl_seg *uninitialized_var(ctrl); struct mlx4_wqe_data_seg *dseg; unsigned long flags; int nreq; int err = 0; unsigned ind; int uninitialized_var(stamp); int uninitialized_var(size); unsigned uninitialized_var(seglen); __be32 dummy; __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); __be32 blh; int i; int inl = 0; spin_lock_irqsave(&qp->sq.lock, flags); ind = qp->sq_next_wqe; for (nreq = 0; wr; ++nreq, wr = wr->next) { lso_wqe = &dummy; blh = 0; if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->sq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); *((u32 *) (&ctrl->vlan_tag)) = 0; qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; ctrl->srcrb_flags = (wr->send_flags & IB_SEND_SIGNALED ? cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | (wr->send_flags & IB_SEND_SOLICITED ? cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | ((wr->send_flags & IB_SEND_IP_CSUM) ? cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) | qp->sq_signal_bits; ctrl->imm = send_ieth(wr); wqe += sizeof *ctrl; size = sizeof *ctrl / 16; switch (qp->mlx4_ib_qp_type) { case MLX4_IB_QPT_RC: case MLX4_IB_QPT_UC: switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: set_raddr_seg(wqe, wr->wr.atomic.remote_addr, wr->wr.atomic.rkey); wqe += sizeof (struct mlx4_wqe_raddr_seg); set_atomic_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_atomic_seg); size += (sizeof (struct mlx4_wqe_raddr_seg) + sizeof (struct mlx4_wqe_atomic_seg)) / 16; break; case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: set_raddr_seg(wqe, wr->wr.atomic.remote_addr, wr->wr.atomic.rkey); wqe += sizeof (struct mlx4_wqe_raddr_seg); set_masked_atomic_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_masked_atomic_seg); size += (sizeof (struct mlx4_wqe_raddr_seg) + sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16; break; case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(wqe, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey); wqe += sizeof (struct mlx4_wqe_raddr_seg); size += sizeof (struct mlx4_wqe_raddr_seg) / 16; break; case IB_WR_LOCAL_INV: ctrl->srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); set_local_inv_seg(wqe, wr->ex.invalidate_rkey); wqe += sizeof (struct mlx4_wqe_local_inval_seg); size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; break; case IB_WR_FAST_REG_MR: ctrl->srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); set_fmr_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_fmr_seg); size += sizeof (struct mlx4_wqe_fmr_seg) / 16; break; case IB_WR_BIND_MW: ctrl->srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); set_bind_seg(wqe, wr); wqe += sizeof(struct mlx4_wqe_bind_seg); size += sizeof(struct mlx4_wqe_bind_seg) / 16; default: /* No extra segments required for sends */ break; } break; case MLX4_IB_QPT_TUN_SMI_OWNER: err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; } wqe += seglen; size += seglen / 16; break; case MLX4_IB_QPT_TUN_SMI: case MLX4_IB_QPT_TUN_GSI: /* this is a UD qp used in MAD responses to slaves. */ set_datagram_seg(wqe, wr); /* set the forced-loopback bit in the data seg av */ *(__be32 *) wqe |= cpu_to_be32(0x80000000); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; break; case MLX4_IB_QPT_UD: set_datagram_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr->opcode == IB_WR_LSO) { err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh); if (unlikely(err)) { *bad_wr = wr; goto out; } lso_wqe = (__be32 *) wqe; wqe += seglen; size += seglen / 16; } break; case MLX4_IB_QPT_PROXY_SMI_OWNER: if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) { err = -ENOSYS; *bad_wr = wr; goto out; } err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; } wqe += seglen; size += seglen / 16; /* to start tunnel header on a cache-line boundary */ add_zero_len_inline(wqe); wqe += 16; size++; build_tunnel_header(wr, wqe, &seglen); wqe += seglen; size += seglen / 16; break; case MLX4_IB_QPT_PROXY_SMI: /* don't allow QP0 sends on guests */ err = -ENOSYS; *bad_wr = wr; goto out; case MLX4_IB_QPT_PROXY_GSI: /* If we are tunneling special qps, this is a UD qp. * In this case we first add a UD segment targeting * the tunnel qp, and then add a header with address * information */ set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; build_tunnel_header(wr, wqe, &seglen); wqe += seglen; size += seglen / 16; break; case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; } wqe += seglen; size += seglen / 16; break; default: break; } /* * Write data segments in reverse order, so as to * overwrite cacheline stamp last within each * cacheline. This avoids issues with WQE * prefetching. */ dseg = wqe; dseg += wr->num_sge - 1; /* Add one more inline data segment for ICRC for MLX sends */ if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { set_mlx_icrc_seg(dseg + 1); size += sizeof (struct mlx4_wqe_data_seg) / 16; } if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { int sz; err = lay_inline_data(qp, wr, wqe, &sz); if (!err) { inl = 1; size += sz; } } else { size += wr->num_sge * (sizeof(struct mlx4_wqe_data_seg) / 16); for (i = wr->num_sge - 1; i >= 0; --i, --dseg) set_data_seg(dseg, wr->sg_list + i); } /* * Possibly overwrite stamping in cacheline with LSO * segment only after making sure all data segments * are written. */ wmb(); *lso_wqe = lso_hdr_sz; ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size; /* * Make sure descriptor is fully written before * setting ownership bit (because HW can start * executing as soon as we do). */ wmb(); if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { *bad_wr = wr; err = -EINVAL; goto out; } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); /* * We can improve latency by not stamping the last * send queue WQE until after ringing the doorbell, so * only stamp here if there are still more WQEs to post. * * Same optimization applies to padding with NOP wqe * in case of WQE shrinking (used to prevent wrap-around * in the middle of WR). */ if (wr->next) { stamp_send_wqe(qp, stamp, size * 16); ind = pad_wraparound(qp, ind); } } out: if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) { ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8); /* We set above doorbell_qpn bits to 0 as part of vlan * tag initialization, so |= should be correct. */ *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn; /* * Make sure that descriptor is written to memory * before writing to BlueFlame page. */ wmb(); ++qp->sq.head; mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl, ALIGN(size * 16, 64)); wc_wmb(); qp->bf.offset ^= qp->bf.buf_size; } else if (nreq) { qp->sq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL); /* * Make sure doorbells don't leak out of SQ spinlock * and reach the HCA out of order. */ mmiowb(); } if (likely(nreq)) { stamp_send_wqe(qp, stamp, size * 16); ind = pad_wraparound(qp, ind); qp->sq_next_wqe = ind; } spin_unlock_irqrestore(&qp->sq.lock, flags); return err; } int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr) { struct mlx4_ib_qp *qp = to_mqp(ibqp); struct mlx4_wqe_data_seg *scat; unsigned long flags; int err = 0; int nreq; int ind; int max_gs; int i; max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->rq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } scat = get_recv_wqe(qp, ind); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { ib_dma_sync_single_for_device(ibqp->device, qp->sqp_proxy_rcv[ind].map, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); scat->byte_count = cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); /* use dma lkey from upper layer entry */ scat->lkey = cpu_to_be32(wr->sg_list->lkey); scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); scat++; max_gs--; } for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); if (i < max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); scat[i].addr = 0; } qp->rq.wrid[ind] = wr->wr_id; ind = (ind + 1) & (qp->rq.wqe_cnt - 1); } out: if (likely(nreq)) { qp->rq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); } spin_unlock_irqrestore(&qp->rq.lock, flags); return err; } static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) { switch (mlx4_state) { case MLX4_QP_STATE_RST: return IB_QPS_RESET; case MLX4_QP_STATE_INIT: return IB_QPS_INIT; case MLX4_QP_STATE_RTR: return IB_QPS_RTR; case MLX4_QP_STATE_RTS: return IB_QPS_RTS; case MLX4_QP_STATE_SQ_DRAINING: case MLX4_QP_STATE_SQD: return IB_QPS_SQD; case MLX4_QP_STATE_SQER: return IB_QPS_SQE; case MLX4_QP_STATE_ERR: return IB_QPS_ERR; default: return -1; } } static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) { switch (mlx4_mig_state) { case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; case MLX4_QP_PM_REARM: return IB_MIG_REARM; case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; default: return -1; } } static int to_ib_qp_access_flags(int mlx4_flags) { int ib_flags = 0; if (mlx4_flags & MLX4_QP_BIT_RRE) ib_flags |= IB_ACCESS_REMOTE_READ; if (mlx4_flags & MLX4_QP_BIT_RWE) ib_flags |= IB_ACCESS_REMOTE_WRITE; if (mlx4_flags & MLX4_QP_BIT_RAE) ib_flags |= IB_ACCESS_REMOTE_ATOMIC; return ib_flags; } static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, struct mlx4_qp_path *path) { struct mlx4_dev *dev = ibdev->dev; int is_eth; memset(ib_ah_attr, 0, sizeof *ib_ah_attr); ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1; if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) return; is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) == IB_LINK_LAYER_ETHERNET; if (is_eth) ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | ((path->sched_queue & 4) << 1); else ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0; if (ib_ah_attr->ah_flags) { ib_ah_attr->grh.sgid_index = path->mgid_index; ib_ah_attr->grh.hop_limit = path->hop_limit; ib_ah_attr->grh.traffic_class = (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; ib_ah_attr->grh.flow_label = be32_to_cpu(path->tclass_flowlabel) & 0xfffff; memcpy(ib_ah_attr->grh.dgid.raw, path->rgid, sizeof ib_ah_attr->grh.dgid.raw); } } int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); struct mlx4_qp_context context; int mlx4_state; int err = 0; mutex_lock(&qp->mutex); if (qp->state == IB_QPS_RESET) { qp_attr->qp_state = IB_QPS_RESET; goto done; } err = mlx4_qp_query(dev->dev, &qp->mqp, &context); if (err) { err = -EINVAL; goto out; } mlx4_state = be32_to_cpu(context.flags) >> 28; qp->state = to_ib_qp_state(mlx4_state); qp_attr->qp_state = qp->state; qp_attr->path_mtu = context.mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); qp_attr->qkey = be32_to_cpu(context.qkey); qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context.params2)); if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; if (qp_attr->qp_state == IB_QPS_INIT) qp_attr->port_num = qp->port; else qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1; /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); qp_attr->max_dest_rd_atomic = 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); qp_attr->min_rnr_timer = (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; qp_attr->timeout = context.pri_path.ackto >> 3; qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; qp_attr->alt_timeout = context.alt_path.ackto >> 3; done: qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; qp_attr->cap.max_recv_sge = qp->rq.max_gs; if (!ibqp->uobject) { qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; qp_attr->cap.max_send_sge = qp->sq.max_gs; } else { qp_attr->cap.max_send_wr = 0; qp_attr->cap.max_send_sge = 0; } /* * We don't support inline sends for kernel QPs (yet), and we * don't know what userspace's value should be. */ qp_attr->cap.max_inline_data = 0; qp_init_attr->cap = qp_attr->cap; qp_init_attr->create_flags = 0; if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; if (qp->flags & MLX4_IB_QP_LSO) qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; if (qp->flags & MLX4_IB_QP_NETIF) qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; qp_init_attr->sq_sig_type = qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; if (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL) qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; if (qp->flags & MLX4_IB_QP_CAP_MANAGED_SEND) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; if (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; qp_init_attr->qpg_type = ibqp->qpg_type; if (ibqp->qpg_type == IB_QPG_PARENT) qp_init_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz; else qp_init_attr->cap.qpg_tss_mask_sz = 0; out: mutex_unlock(&qp->mutex); return err; } Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (revision 296382) @@ -1,999 +1,998 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" #include #include #include #include #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; module_param(data_debug_level, int, 0644); MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif static DEFINE_MUTEX(pkey_mutex); struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv, struct ib_pd *pd, struct ib_ah_attr *attr) { struct ipoib_ah *ah; ah = kmalloc(sizeof *ah, GFP_KERNEL); if (!ah) return NULL; ah->priv = priv; ah->last_send = 0; kref_init(&ah->ref); ah->ah = ib_create_ah(pd, attr); if (IS_ERR(ah->ah)) { kfree(ah); ah = NULL; } else ipoib_dbg(priv, "Created ah %p\n", ah->ah); return ah; } void ipoib_free_ah(struct kref *kref) { struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); struct ipoib_dev_priv *priv = ah->priv; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_add_tail(&ah->list, &priv->dead_ahs); spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req) { struct mbuf *m; int i; for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++) ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len, DMA_FROM_DEVICE); } void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length) { m_adj(mb, -(mb->m_pkthdr.len - length)); } struct mbuf * ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size) { struct mbuf *mb, *m; int i, j; rx_req->mb = NULL; mb = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR); if (mb == NULL) return (NULL); for (i = 0, m = mb; m != NULL; m = m->m_next, i++) { m->m_len = M_SIZE(m); mb->m_pkthdr.len += m->m_len; rx_req->mapping[i] = ib_dma_map_single(priv->ca, mtod(m, void *), m->m_len, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(priv->ca, rx_req->mapping[i]))) goto error; } rx_req->mb = mb; return (mb); error: for (j = 0, m = mb; j < i; m = m->m_next, j++) ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len, DMA_FROM_DEVICE); m_freem(mb); return (NULL); } static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id) { struct ipoib_rx_buf *rx_req; struct ib_recv_wr *bad_wr; struct mbuf *m; int ret; int i; rx_req = &priv->rx_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { priv->rx_sge[i].addr = rx_req->mapping[i]; priv->rx_sge[i].length = m->m_len; } priv->rx_wr.num_sge = i; priv->rx_wr.wr_id = id | IPOIB_OP_RECV; ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]); m_freem(priv->rx_ring[id].mb); priv->rx_ring[id].mb = NULL; } return ret; } static struct mbuf * ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id) { return ipoib_alloc_map_mb(priv, &priv->rx_ring[id], priv->max_ib_mtu + IB_GRH_BYTES); } static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv) { int i; for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_alloc_rx_mb(priv, i)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); return -ENOMEM; } if (ipoib_ib_post_receive(priv, i)) { ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); return -EIO; } } return 0; } static void ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ipoib_rx_buf saverx; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; struct ifnet *dev = priv->dev; struct ipoib_header *eh; struct mbuf *mb; ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", wr_id, ipoib_recvq_size); return; } mb = priv->rx_ring[wr_id].mb; if (unlikely(wc->status != IB_WC_SUCCESS)) { if (wc->status != IB_WC_WR_FLUSH_ERR) { ipoib_warn(priv, "failed recv event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); goto repost; } if (mb) { ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]); m_freem(mb); priv->rx_ring[wr_id].mb = NULL; } return; } /* * Drop packets that this interface sent, ie multicast packets * that the HCA has replicated. */ if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) goto repost; memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx)); /* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer. */ if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) { memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx)); if_inc_counter(dev, IFCOUNTER_IQDROPS, 1); goto repost; } ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); ipoib_dma_unmap_rx(priv, &saverx); ipoib_dma_mb(priv, mb, wc->byte_len); if_inc_counter(dev, IFCOUNTER_IPACKETS, 1); if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len); mb->m_pkthdr.rcvif = dev; m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN); eh = mtod(mb, struct ipoib_header *); bzero(eh->hwaddr, 4); /* Zero the queue pair, only dgid is in grh */ if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID; dev->if_input(dev, mb); repost: if (unlikely(ipoib_ib_post_receive(priv, wr_id))) ipoib_warn(priv, "ipoib_ib_post_receive failed " "for buf %d\n", wr_id); } int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max) { struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m, *p; int error; int i; for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) { if (m->m_len != 0) continue; if (p == NULL) panic("ipoib_dma_map_tx: First mbuf empty\n"); p->m_next = m_free(m); m = p; i--; } i--; if (i >= max) { tx_req->mb = mb = m_defrag(mb, M_NOWAIT); if (mb == NULL) return -EIO; for (m = mb, i = 0; m != NULL; m = m->m_next, i++); if (i >= max) return -EIO; } error = 0; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { mapping[i] = ib_dma_map_single(ca, mtod(m, void *), m->m_len, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) { error = -EIO; break; } } if (error) { int end; end = i; for (m = mb, i = 0; i < end; m = m->m_next, i++) ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE); } return error; } void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) { struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m; int i; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE); } static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ifnet *dev = priv->dev; unsigned int wr_id = wc->wr_id; struct ipoib_tx_buf *tx_req; ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_sendq_size)) { ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", wr_id, ipoib_sendq_size); return; } tx_req = &priv->tx_ring[wr_id]; ipoib_dma_unmap_tx(priv->ca, tx_req); if_inc_counter(dev, IFCOUNTER_OPACKETS, 1); m_freem(tx_req->mb); ++priv->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && (dev->if_drv_flags & IFF_DRV_OACTIVE) && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) ipoib_warn(priv, "failed send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); } int ipoib_poll_tx(struct ipoib_dev_priv *priv) { int n, i; n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); for (i = 0; i < n; ++i) { struct ib_wc *wc = priv->send_wc + i; if (wc->wr_id & IPOIB_OP_CM) ipoib_cm_handle_tx_wc(priv, wc); else ipoib_ib_handle_tx_wc(priv, wc); } return n == MAX_SEND_CQE; } static void ipoib_poll(struct ipoib_dev_priv *priv) { int n, i; poll_more: spin_lock(&priv->drain_lock); for (;;) { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); - for (i = 0; i < n; i++) { struct ib_wc *wc = priv->ibwc + i; if ((wc->wr_id & IPOIB_OP_RECV) == 0) panic("ipoib_poll: Bad wr_id 0x%jX\n", (intmax_t)wc->wr_id); if (wc->wr_id & IPOIB_OP_CM) ipoib_cm_handle_rx_wc(priv, wc); else ipoib_ib_handle_rx_wc(priv, wc); } if (n != IPOIB_NUM_WC) break; } spin_unlock(&priv->drain_lock); if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)) goto poll_more; } void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) { struct ipoib_dev_priv *priv = dev_ptr; ipoib_poll(priv); } static void drain_tx_cq(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; spin_lock(&priv->lock); while (ipoib_poll_tx(priv)) ; /* nothing */ if (dev->if_drv_flags & IFF_DRV_OACTIVE) mod_timer(&priv->poll_timer, jiffies + 1); spin_unlock(&priv->lock); } void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) { struct ipoib_dev_priv *priv = dev_ptr; mod_timer(&priv->poll_timer, jiffies); } static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head, int hlen) { struct ib_send_wr *bad_wr; struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m; int i; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { priv->tx_sge[i].addr = mapping[i]; priv->tx_sge[i].length = m->m_len; } priv->tx_wr.num_sge = i; priv->tx_wr.wr_id = wr_id; priv->tx_wr.wr.ud.remote_qpn = qpn; priv->tx_wr.wr.ud.ah = address; if (head) { priv->tx_wr.wr.ud.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */ priv->tx_wr.wr.ud.header = head; priv->tx_wr.wr.ud.hlen = hlen; priv->tx_wr.opcode = IB_WR_LSO; } else priv->tx_wr.opcode = IB_WR_SEND; return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); } void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn) { struct ifnet *dev = priv->dev; struct ipoib_tx_buf *tx_req; int hlen; void *phead; if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) while (ipoib_poll_tx(priv)) ; /* nothing */ m_adj(mb, sizeof (struct ipoib_pseudoheader)); if (0 /* XXX segment offload mb_is_gso(mb) */) { /* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */ phead = mtod(mb, void *); if (mb->m_len < hlen) { ipoib_warn(priv, "linear data too small\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); return; } m_adj(mb, hlen); } else { if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", mb->m_pkthdr.len, priv->mcast_mtu); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu); return; } phead = NULL; hlen = 0; } ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", mb->m_pkthdr.len, address, qpn); /* * We put the mb into the tx_ring _before_ we call post_send() * because it's entirely possible that the completion handler will * run before we execute anything after the post_send(). That * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). */ tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; tx_req->mb = mb; if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); if (tx_req->mb) m_freem(tx_req->mb); return; } if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; else priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) ipoib_warn(priv, "request notify on send CQ failed\n"); dev->if_drv_flags |= IFF_DRV_OACTIVE; } if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, tx_req, phead, hlen))) { ipoib_warn(priv, "post_send failed\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); --priv->tx_outstanding; ipoib_dma_unmap_tx(priv->ca, tx_req); m_freem(mb); if (dev->if_drv_flags & IFF_DRV_OACTIVE) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; } else { address->last_send = priv->tx_head; ++priv->tx_head; } } static void __ipoib_reap_ah(struct ipoib_dev_priv *priv) { struct ipoib_ah *ah, *tah; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) if ((int) priv->tx_tail - (int) ah->last_send >= 0) { list_del(&ah->list); ib_destroy_ah(ah->ah); kfree(ah); } spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_reap_ah(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, ah_reap_task.work); __ipoib_reap_ah(priv); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); } static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv) { unsigned long begin; begin = jiffies; while (!list_empty(&priv->dead_ahs)) { __ipoib_reap_ah(priv); if (time_after(jiffies, begin + HZ)) { ipoib_warn(priv, "timing out; will leak address handles\n"); break; } msleep(1); } } static void ipoib_ib_tx_timer_func(unsigned long ctx) { drain_tx_cq((struct ipoib_dev_priv *)ctx); } int ipoib_ib_dev_open(struct ipoib_dev_priv *priv) { int ret; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); return -1; } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = ipoib_init_qp(priv); if (ret) { ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); return -1; } ret = ipoib_ib_post_receives(priv); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); ipoib_ib_dev_stop(priv, 1); return -1; } ret = ipoib_cm_dev_open(priv); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); ipoib_ib_dev_stop(priv, 1); return -1; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); return 0; } static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv) { u16 pkey_index = 0; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); else set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } int ipoib_ib_dev_up(struct ipoib_dev_priv *priv) { ipoib_pkey_dev_check_presence(priv); if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { ipoib_dbg(priv, "PKEY is not assigned.\n"); return 0; } set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); return ipoib_mcast_start_thread(priv); } int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush) { ipoib_dbg(priv, "downing ib_dev\n"); clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); if_link_state_change(priv->dev, LINK_STATE_DOWN); /* Shutdown the P_Key thread if still active */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); cancel_delayed_work(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); if (flush) flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(priv, flush); ipoib_mcast_dev_flush(priv); ipoib_flush_paths(priv); return 0; } static int recvs_pending(struct ipoib_dev_priv *priv) { int pending = 0; int i; for (i = 0; i < ipoib_recvq_size; ++i) if (priv->rx_ring[i].mb) ++pending; return pending; } void ipoib_drain_cq(struct ipoib_dev_priv *priv) { int i, n; spin_lock(&priv->drain_lock); do { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { /* * Convert any successful completions to flush * errors to avoid passing packets up the * stack after bringing the device down. */ if (priv->ibwc[i].status == IB_WC_SUCCESS) priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0) panic("ipoib_drain_cq: Bad wrid 0x%jX\n", (intmax_t)priv->ibwc[i].wr_id); if (priv->ibwc[i].wr_id & IPOIB_OP_CM) ipoib_cm_handle_rx_wc(priv, priv->ibwc + i); else ipoib_ib_handle_rx_wc(priv, priv->ibwc + i); } } while (n == IPOIB_NUM_WC); spin_unlock(&priv->drain_lock); spin_lock(&priv->lock); while (ipoib_poll_tx(priv)) ; /* nothing */ spin_unlock(&priv->lock); } int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush) { struct ib_qp_attr qp_attr; unsigned long begin; struct ipoib_tx_buf *tx_req; int i; ipoib_cm_dev_stop(priv); /* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. */ qp_attr.qp_state = IB_QPS_ERR; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); /* Wait for all sends and receives to complete */ begin = jiffies; while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", priv->tx_head - priv->tx_tail, recvs_pending(priv)); /* * assume the HW is wedged and just free up * all our pending work requests. */ while ((int) priv->tx_tail - (int) priv->tx_head < 0) { tx_req = &priv->tx_ring[priv->tx_tail & (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv->ca, tx_req); m_freem(tx_req->mb); ++priv->tx_tail; --priv->tx_outstanding; } for (i = 0; i < ipoib_recvq_size; ++i) { struct ipoib_rx_buf *rx_req; rx_req = &priv->rx_ring[i]; if (!rx_req->mb) continue; ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]); m_freem(rx_req->mb); rx_req->mb = NULL; } goto timeout; } ipoib_drain_cq(priv); msleep(1); } ipoib_dbg(priv, "All sends and receives done.\n"); timeout: del_timer_sync(&priv->poll_timer); qp_attr.qp_state = IB_QPS_RESET; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); /* Wait for all AHs to be reaped */ set_bit(IPOIB_STOP_REAPER, &priv->flags); cancel_delayed_work(&priv->ah_reap_task); if (flush) flush_workqueue(ipoib_workqueue); ipoib_ah_dev_cleanup(priv); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); return 0; } int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { struct ifnet *dev = priv->dev; priv->ca = ca; priv->port = port; priv->qp = NULL; if (ipoib_transport_dev_init(priv, ca)) { printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); return -ENODEV; } setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, (unsigned long) priv); if (dev->if_flags & IFF_UP) { if (ipoib_ib_dev_open(priv)) { ipoib_transport_dev_cleanup(priv); return -ENODEV; } } return 0; } static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; u16 new_index; mutex_lock(&priv->vlan_mutex); /* * Flush any child interfaces too -- they might be up even if * the parent is down. */ list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); mutex_unlock(&priv->vlan_mutex); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } if (level == IPOIB_FLUSH_HEAVY) { if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (ipoib_pkey_dev_delay_open(priv)) return; } /* restart QP only if P_Key index is changed */ if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && new_index == priv->pkey_index) { ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; } priv->pkey_index = new_index; } if (level == IPOIB_FLUSH_LIGHT) { ipoib_mark_paths_invalid(priv); ipoib_mcast_dev_flush(priv); } if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_down(priv, 0); if (level == IPOIB_FLUSH_HEAVY) { ipoib_ib_dev_stop(priv, 0); ipoib_ib_dev_open(priv); } /* * The device could have been brought down between the start and when * we get here, don't bring it back up if it's not configured up */ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_up(priv); ipoib_mcast_restart_task(&priv->restart_task); } } void ipoib_ib_dev_flush_light(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_light); __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); } void ipoib_ib_dev_flush_normal(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_normal); __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); } void ipoib_ib_dev_flush_heavy(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_heavy); __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); } void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv) { ipoib_dbg(priv, "cleaning up ib_dev\n"); ipoib_mcast_stop_thread(priv, 1); ipoib_mcast_dev_flush(priv); ipoib_ah_dev_cleanup(priv); ipoib_transport_dev_cleanup(priv); } /* * Delayed P_Key Assigment Interim Support * * The following is initial implementation of delayed P_Key assigment * mechanism. It is using the same approach implemented for the multicast * group join. The single goal of this implementation is to quickly address * Bug #2507. This implementation will probably be removed when the P_Key * change async notification is available. */ void ipoib_pkey_poll(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); ipoib_pkey_dev_check_presence(priv); if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) ipoib_open(priv); else { mutex_lock(&pkey_mutex); if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); } } int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv) { /* Look for the interface pkey value in the IB Port P_Key table and */ /* set the interface pkey assigment flag */ ipoib_pkey_dev_check_presence(priv); /* P_Key value not assigned yet - start polling */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); clear_bit(IPOIB_PKEY_STOP, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); return 1; } return 0; } Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c (revision 296381) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c (revision 296382) @@ -1,293 +1,294 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey) { struct ib_qp_attr *qp_attr = NULL; int ret; u16 pkey_index; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = -ENXIO; goto out; } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); if (set_qkey) { ret = -ENOMEM; qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); if (!qp_attr) goto out; /* set correct QKey for QP */ qp_attr->qkey = priv->qkey; ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); if (ret) { ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); goto out; } } /* attach QP to multicast group */ ret = ib_attach_mcast(priv->qp, mgid, mlid); if (ret) ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); out: kfree(qp_attr); return ret; } int ipoib_init_qp(struct ipoib_dev_priv *priv) { int ret; struct ib_qp_attr qp_attr; int attr_mask; if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return -1; qp_attr.qp_state = IB_QPS_INIT; qp_attr.qkey = 0; qp_attr.port_num = priv->port; qp_attr.pkey_index = priv->pkey_index; attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret); goto out_fail; } qp_attr.qp_state = IB_QPS_RTR; /* Can't set this in a INIT->RTR transition */ attr_mask &= ~IB_QP_PORT; ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret); goto out_fail; } qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; attr_mask |= IB_QP_SQ_PSN; attr_mask &= ~IB_QP_PKEY_INDEX; ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret); goto out_fail; } return 0; out_fail: qp_attr.qp_state = IB_QPS_RESET; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); return ret; } int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca) { struct ib_qp_init_attr init_attr = { .cap = { .max_send_wr = ipoib_sendq_size, .max_recv_wr = ipoib_recvq_size, .max_send_sge = 1, .max_recv_sge = IPOIB_UD_RX_SG }, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_UD }; int ret, size; int i; /* XXX struct ethtool_coalesce *coal; */ priv->pd = ib_alloc_pd(priv->ca); if (IS_ERR(priv->pd)) { printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); return -ENODEV; } priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(priv->mr)) { printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); goto out_free_pd; } size = ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(priv); if (!ret) { size += ipoib_sendq_size; if (ipoib_cm_has_srq(priv)) size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ else size += ipoib_recvq_size * ipoib_max_conn_qp; } priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, size, 0); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); goto out_free_mr; } priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, priv, ipoib_sendq_size, 0); if (IS_ERR(priv->send_cq)) { printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); goto out_free_recv_cq; } if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) goto out_free_send_cq; #if 0 /* XXX */ coal = kzalloc(sizeof *coal, GFP_KERNEL); if (coal) { coal->rx_coalesce_usecs = 10; coal->tx_coalesce_usecs = 10; coal->rx_max_coalesced_frames = 16; coal->tx_max_coalesced_frames = 16; dev->ethtool_ops->set_coalesce(dev, coal); kfree(coal); } #endif init_attr.send_cq = priv->send_cq; init_attr.recv_cq = priv->recv_cq; if (priv->hca_caps & IB_DEVICE_UD_TSO) init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; init_attr.cap.max_send_sge = IPOIB_UD_TX_SG; priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { printk(KERN_WARNING "%s: failed to create QP\n", ca->name); goto out_free_send_cq; } IF_LLADDR(priv->dev)[1] = (priv->qp->qp_num >> 16) & 0xff; IF_LLADDR(priv->dev)[2] = (priv->qp->qp_num >> 8) & 0xff; IF_LLADDR(priv->dev)[3] = (priv->qp->qp_num ) & 0xff; for (i = 0; i < IPOIB_MAX_TX_SG; ++i) priv->tx_sge[i].lkey = priv->mr->lkey; priv->tx_wr.opcode = IB_WR_SEND; priv->tx_wr.sg_list = priv->tx_sge; priv->tx_wr.send_flags = IB_SEND_SIGNALED; for (i = 0; i < IPOIB_UD_RX_SG; ++i) priv->rx_sge[i].lkey = priv->mr->lkey; priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; return 0; out_free_send_cq: ib_destroy_cq(priv->send_cq); out_free_recv_cq: ib_destroy_cq(priv->recv_cq); out_free_mr: ib_dereg_mr(priv->mr); ipoib_cm_dev_cleanup(priv); out_free_pd: ib_dealloc_pd(priv->pd); return -ENODEV; } void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv) { if (priv->qp) { if (ib_destroy_qp(priv->qp)) ipoib_warn(priv, "ib_qp_destroy failed\n"); priv->qp = NULL; clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } if (ib_destroy_cq(priv->send_cq)) ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); if (ib_destroy_cq(priv->recv_cq)) ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); ipoib_cm_dev_cleanup(priv); if (ib_dereg_mr(priv->mr)) ipoib_warn(priv, "ib_dereg_mr failed\n"); if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); } void ipoib_event(struct ib_event_handler *handler, struct ib_event *record) { struct ipoib_dev_priv *priv = container_of(handler, struct ipoib_dev_priv, event_handler); if (record->element.port_num != priv->port) return; ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, record->device->name, record->element.port_num); if (record->event == IB_EVENT_SM_CHANGE || record->event == IB_EVENT_CLIENT_REREGISTER) { queue_work(ipoib_workqueue, &priv->flush_light); } else if (record->event == IB_EVENT_PORT_ERR || record->event == IB_EVENT_PORT_ACTIVE || record->event == IB_EVENT_LID_CHANGE) { queue_work(ipoib_workqueue, &priv->flush_normal); } else if (record->event == IB_EVENT_PKEY_CHANGE) { queue_work(ipoib_workqueue, &priv->flush_heavy); } }