Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4384,6 +4384,27 @@ dev/mlx5/mlx5_en/mlx5_en_txrx.c optional mlx5en pci inet inet6 \ compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ah.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_cq.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_doorbell.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_mad.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_mem.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_mr.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_roce.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_srq.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_main.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_qp.c optional mlx5ib pci inet inet6 \ + compile-with "${OFED_C}" + ofed/drivers/infiniband/hw/mthca/mthca_allocator.c optional mthca \ compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_av.c optional mthca \ Index: sys/dev/mlx5/mlx5_ib/mlx5_ah.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_ah.c @@ -0,0 +1,128 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "mlx5_ib.h" + +#define IPV6_DEFAULT_HOPLIMIT 64 + +struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah, enum rdma_link_layer ll) +{ + int err; + int gid_type; + + if (ah_attr->ah_flags & IB_AH_GRH) { + memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label | + (1 << 30) | + ah_attr->grh.sgid_index << 20); + ah->av.hop_limit = ah_attr->grh.hop_limit; + ah->av.tclass = ah_attr->grh.traffic_class; + } + + ah->av.stat_rate_sl = (ah_attr->static_rate << 4); + + if (ll == IB_LINK_LAYER_ETHERNET) { + err = mlx5_get_roce_gid_type(dev, ah_attr->port_num, + ah_attr->grh.sgid_index, + &gid_type); + if (err) + return ERR_PTR(err); + memcpy(ah->av.rmac, ah_attr->dmac, sizeof(ah_attr->dmac)); + ah->av.udp_sport = mlx5_get_roce_udp_sport( + dev, + ah_attr->port_num, + ah_attr->grh.sgid_index, + 0); + ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1; + if ((gid_type != IB_GID_TYPE_IB) && + (ah_attr->grh.hop_limit < 2)) + ah->av.hop_limit = IPV6_DEFAULT_HOPLIMIT; + else + ah->av.hop_limit = ah_attr->grh.hop_limit; + /* TODO: initialize other eth fields */ + } else { + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl |= (ah_attr->sl & 0xf); + } + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + enum rdma_link_layer ll; + struct ib_ah *ret = ERR_PTR(-EINVAL); + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + ll = pd->device->get_link_layer(pd->device, ah_attr->port_num); + + if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH)) + goto err_kfree_ah; + + return create_ib_ah(dev, ah_attr, ah, ll); /* never fails */ + +err_kfree_ah: + kfree(ah); + return ret; +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.sgid_index = (tmp >> 20) & 0xff; + ah_attr->grh.flow_label = tmp & 0xfffff; + memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16); + ah_attr->grh.hop_limit = ah->av.hop_limit; + ah_attr->grh.traffic_class = ah->av.tclass; + } + ah_attr->dlid = be16_to_cpu(ah->av.rlid); + ah_attr->static_rate = ah->av.stat_rate_sl >> 4; + ah_attr->sl = ah->av.stat_rate_sl & 0xf; + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} Index: sys/dev/mlx5/mlx5_ib/mlx5_cq.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_cq.c @@ -0,0 +1,1383 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "user.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, int type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size) +{ + return mlx5_buf_offset(&buf->buf, n * size); +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? 
cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->swr_ctx[idx].wr_data) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_FAST_REG_MR: + return IB_WC_FAST_REG_MR; + + case IB_WR_REG_INDIR_MR: + return IB_WC_REG_INDIR_MR; + + default: + printf("mlx5_ib: WARN: ""unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_NOP: + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 g; +#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) + u8 udp_header_valid; +#endif + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 
0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; + +#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) + udp_header_valid = wc->sl & 0x8; + if (udp_header_valid) + wc->wc_flags |= IB_WC_WITH_UDP_HDR; + +#endif + switch (wc->sl & 0x3) { + case CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_IB; + break; + case CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + __be32 *p = (__be32 *)cqe; + int i; + + mlx5_ib_warn(dev, "dump error cqe\n"); + for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4) + printf("mlx5_ib: INFO: ""%08x %08x %08x %08x\n", be32_to_cpu(p[0]), be32_to_cpu(p[1]), be32_to_cpu(p[2]), be32_to_cpu(p[3])); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, u16 idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, u16 idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(uintptr_t)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(u32 *)addr = be32_to_cpu(*((__be32 *)addr)); + } else { + for (i = 0; i < byte_count; i += 8) { + *(u64 *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) 
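/*
 * Note on handle_atomics(): tail is the send queue's last_poll slot and
 * head is the index of the WQE this completion reports.  The function
 * walks the work-request list from tail up to and including head, letting
 * handle_atomic() byte-swap any atomic response payload into host order,
 * and then advances sq.last_poll past the completed WQE.  Because
 * is_atomic_response() above currently returns 0 (TBD), the swap is
 * effectively disabled for now.
 */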
+{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.swr_ctx[idx].w_list.next; + } while (1); + tail = qp->sq.swr_ctx[idx].w_list.next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(dev->mdev, &buf->buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + printf("mlx5_ib: ERR: ""Got signature completion error with bad syndrome %04x\n", syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned cur; + unsigned idx; + int np; + int i; + + wq = &qp->sq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + idx = wq->last_poll & (wq->wqe_cnt - 1); + wc->wr_id = wq->swr_ctx[idx].wrid; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + wq->last_poll = wq->swr_ctx[idx].w_list.next; + } + *npolled = np; +} + +static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned cur; + int np; + int i; + + wq = &qp->rq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + } + *npolled = np; +} + +static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and retrun mmics ones */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + sw_send_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } + + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + sw_recv_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } +} + + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp *mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mr *mmr; + struct mlx5_ib_mr *mr; + unsigned long flags; + u8 opcode; + u32 qpn; + u16 wqe_ctr; + void *cqe; + int idx; + 
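/*
 * Overview of mlx5_poll_one(): fetch the next software-owned CQE, match it
 * to its QP, and translate the hardware opcode or error syndrome into an
 * ib_wc.  Resize and signature-error CQEs are consumed internally and
 * restart at the repoll label; -EAGAIN means no CQE is currently owned by
 * software.
 *
 * Ownership is decided by get_sw_cqe() above: for a CQ of nent entries
 * (ibcq.cqe == nent - 1), the entry at index n belongs to software when its
 * opcode is not MLX5_CQE_INVALID and
 *
 *	(cqe64->op_own & MLX5_CQE_OWNER_MASK) == !!(n & nent)
 *
 * i.e. the owner bit matches the wrap state of the consumer index, so the
 * expected value flips each time the ring wraps.
 */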
+repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx5_qp_lookup(dev->mdev, qpn); + if (unlikely(!mqp)) { + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n", + cq->mcq.cqn, qpn); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->swr_ctx[idx].wrid; + wq->tail = wq->swr_ctx[idx].wqe_head + 1; + if (unlikely(wq->swr_ctx[idx].w_list.opcode & + MLX5_OPCODE_SIGNATURE_CANCELED)) + wc->status = IB_WC_SIG_PIPELINE_CANCELED; + else + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? 
+ "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->swr_ctx[idx].wrid; + wq->tail = wq->swr_ctx[idx].wqe_head + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + spin_lock_irqsave(&dev->mdev->priv.mr_table.lock, flags); + mmr = __mlx5_mr_lookup(dev->mdev, + mlx5_mkey_to_idx(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmr)) { + spin_unlock_irqrestore(&dev->mdev->priv.mr_table.lock, flags); + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", + cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); + return -EINVAL; + } + + mr = to_mibmr(mmr); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + (unsigned long long) + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + spin_unlock_irqrestore(&dev->mdev->priv.mr_table.lock, flags); + goto repoll; + } + + return 0; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } + + for (npolled = 0; npolled < num_entries; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); +out: + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + + + mlx5_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
+ MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + uar_page, + MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), + to_mcq(ibcq)->mcq.cons_index); + + return 0; +} + +static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, + int nent, int cqe_size) +{ + int err; + + err = mlx5_buf_alloc(dev->mdev, nent * cqe_size, + PAGE_SIZE * 2, &buf->buf); + if (err) { + mlx5_ib_err(dev, "alloc failed\n"); + return err; + } + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, struct mlx5_create_cq_mbox_in **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_exp_ib_create_cq ucmd; + size_t ucmdlen; + int page_shift; + int npages; + int ncont; + int err; + + memset(&ucmd, 0, sizeof(ucmd)); + + if (udata->src == IB_UDATA_EXP_CMD) + ucmdlen = min(sizeof(ucmd), udata->inlen); + else + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(struct mlx5_ib_create_cq)) ? + (sizeof(struct mlx5_ib_create_cq) - sizeof(ucmd.reserved)) : + sizeof(struct mlx5_ib_create_cq); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_err(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && ucmd.reserved != 0) { + mlx5_ib_err(dev, "command corrupted\n"); + return -EINVAL; + } + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) { + mlx5_ib_warn(dev, "wrong CQE size %d\n", ucmd.cqe_size); + return -EINVAL; + } + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) { + mlx5_ib_warn(dev, "map failed\n"); + goto err_umem; + } + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + (unsigned long long)ucmd.buf_addr, entries * ucmd.cqe_size, + npages, page_shift, ncont); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + *index = to_mucontext(context)->uuari.uars[0].index; + + if (*cqe_size == 64 && MLX5_CAP_GEN(dev->mdev, cqe_compression)) { + if (ucmd.exp_data.cqe_comp_en == 1 && + (ucmd.exp_data.comp_mask & MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_EN)) { + MLX5_SET(cqc, &(*cqb)->ctx, cqe_compression_en, 1); + if (ucmd.exp_data.cqe_comp_recv_type == + MLX5_IB_CQE_FORMAT_CSUM && + (ucmd.exp_data.comp_mask & + MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_RECV_TYPE)) + MLX5_SET(cqc, &(*cqb)->ctx, mini_cqe_res_format, + MLX5_IB_CQE_FORMAT_CSUM); + } + } + + return 0; + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? 
cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + struct mlx5_create_cq_mbox_in **cqb, + int *index, int *inlen) +{ + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_buf(cq, &cq->buf); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); + + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + *index = dev->mdev->priv.uuari.uars[0].index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +enum { + CQ_CREATE_FLAGS_SUPPORTED = IB_CQ_TIMESTAMP | + IB_CQ_CREATE_CROSS_CHANNEL +}; + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_create_cq_mbox_in *cqb = NULL; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int vector = attr->comp_vector; + int entries = attr->cqe; + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + int cqe_size; + int irqn; + int eqn; + int err; + + if (entries < 0 || roundup_pow_of_two(entries + 1) > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d(%ld), max %d\n", + entries, roundup_pow_of_two(entries + 1), + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + return ERR_PTR(-EINVAL); + } + + entries = roundup_pow_of_two(entries + 1); + + if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED) { + mlx5_ib_warn(dev, "CQ create flag %x not supported\n", + attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED); + return ERR_PTR(-EOPNOTSUPP); + } + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + cqe_size = (cache_line_size() >= 128 ? 
128 : 64); + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + } + + cq->cqe_size = cqe_size; + cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); + err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cqb->ctx.c_eqn = cpu_to_be16(eqn); + cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + kvfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + kvfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. 
+ */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, + struct ib_cq_attr *cq_attr, + int cq_attr_mask) +{ + struct mlx5_modify_cq_mbox_in *in; + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + u32 fsel = 0; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->cqn = cpu_to_be32(mcq->mcq.cqn); + if (cq_attr_mask & IB_CQ_MODERATION) { + u16 cq_count = cq_attr->moderation.cq_count; + u16 cq_period = cq_attr->moderation.cq_period; + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { + fsel |= (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + if (cq_period & 0xf000) { + /* A value higher than 0xfff is required, better + * use the largest value possible. */ + cq_period = 0xfff; + printf("mlx5_ib: INFO: ""period supported is limited to 12 bits\n"); + } + + in->ctx.cq_period = cpu_to_be16(cq_period); + in->ctx.cq_max_count = cpu_to_be16(cq_count); + } else { + err = -ENOSYS; + goto out; + } + } + + if (cq_attr_mask & IB_CQ_CAP_FLAGS) { + if (MLX5_CAP_GEN(dev->mdev, cq_oi)) { + fsel |= MLX5_CQ_MODIFY_OVERRUN; + if (cq_attr->cq_cap_flags & IB_CQ_IGNORE_OVERRUN) + in->ctx.cqe_sz_flags |= MLX5_CQ_FLAGS_OI; + else + in->ctx.cqe_sz_flags &= ~MLX5_CQ_FLAGS_OI; + } else { + err = -ENOSYS; + goto out; + } + } + in->field_select = cpu_to_be32(fsel); + err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in)); + +out: + kfree(in); + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from 
different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + printf("mlx5_ib: WARN: ""resize CQ failed to get resize CQE, CQN 0x%x\n", cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { + mlx5_ib_warn(dev, "Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1 || roundup_pow_of_two(entries + 1) > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d(%ld), max %d\n", + entries, roundup_pow_of_two(entries + 1), + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + return -EINVAL; + } + + entries = roundup_pow_of_two(entries + 1); + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) { + mlx5_ib_warn(dev, "resize failed: %d\n", err); + goto ex; + } + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) { + mlx5_ib_warn(dev, "modify cq failed: %d\n", err); + goto ex_alloc; + } + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + 
spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + kvfree(in); + return 0; + +ex_alloc: + kvfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} Index: sys/dev/mlx5/mlx5_ib/mlx5_doorbell.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_doorbell.c @@ -0,0 +1,93 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + uintptr_t user_virt; + int refcnt; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, uintptr_t virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} Index: sys/dev/mlx5/mlx5_ib/mlx5_ib.h =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_ib.h @@ -0,0 +1,933 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("mlx5_dbg:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, curthread->td_proc->p_pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) 
\ +printf("mlx5_ib: ERR: ""mlx5_err:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, curthread->td_proc->p_pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +printf("mlx5_ib: WARN: ""mlx5_warn:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, curthread->td_proc->p_pid, ##arg) +#define BF_ENABLE 0 + +extern struct workqueue_struct *mlx5_ib_wq; + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, + MLX5_IB_MMAP_MAP_DC_INFO_PAGE = 4, + + /* Use EXP mmap commands until it is pushed to upstream */ + MLX5_IB_EXP_MMAP_CORE_CLOCK = 0xFB, + MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA = 0xFC, + MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA = 0xFD, + MLX5_IB_EXP_ALLOC_N_MMAP_WC = 0xFE, +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum { + MLX5_DCT_CS_RES_64 = 2, + MLX5_CNAK_RX_POLL_CQ_QUOTA = 256, +}; + +enum mlx5_ib_latency_class { + MLX5_IB_LATENCY_CLASS_LOW, + MLX5_IB_LATENCY_CLASS_MEDIUM, + MLX5_IB_LATENCY_CLASS_HIGH, + MLX5_IB_LATENCY_CLASS_FAST_PATH +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX5_CROSS_CHANNEL_UUAR = 0, +}; + +enum { + MLX5_IB_MAX_CTX_DYNAMIC_UARS = 256, + MLX5_IB_INVALID_UAR_INDEX = -1U +}; + +enum { + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES = 13, + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES = 6, + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES = 16, + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES = 9, +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_uuar_info uuari; + u32 dynamic_wc_uar_index[MLX5_IB_MAX_CTX_DYNAMIC_UARS]; + /* Transport Domain number */ + u32 tdn; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; + u32 pa_lkey; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +#define MLX5_IB_QPT_SW_CNAK IB_QPT_RESERVED2 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +struct wr_list { + u16 opcode; + u16 next; +}; + +struct mlx5_swr_ctx { + u64 wrid; + u32 wr_data; + struct wr_list w_list; + u32 wqe_head; + u8 sig_piped; + u8 rsvd[11]; +}; + +struct mlx5_rwr_ctx { + u64 wrid; +}; + +struct mlx5_ib_wq { + union { + struct mlx5_swr_ctx *swr_ctx; + struct mlx5_rwr_ctx *rwr_ctx; + }; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +struct mlx5_ib_rwq { + struct ib_wq ibwq; + u32 rqn; + u32 rq_num_pas; + u32 log_rq_stride; + u32 log_rq_size; + u32 rq_page_offset; + u32 log_page_size; + struct ib_umem *umem; + int buf_size; + unsigned int page_shift; + 
int create_type; + struct mlx5_db db; + u32 user_index; + u32 wqe_count; + u32 wqe_shift; + int wq_sig; +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +enum { + MLX5_WQ_USER, + MLX5_WQ_KERNEL +}; + +struct mlx5_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_rwq_ind_tbl; + u32 rqtn; +}; + + +struct mlx5_ib_sqd { + struct mlx5_ib_qp *qp; + struct work_struct work; +}; + +struct mlx5_ib_mc_flows_list { + struct list_head flows_list; + /*Protect the flows_list*/ + struct mutex lock; +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + struct mlx5_core_qp mqp; + struct mlx5_core_qp mrq; + struct mlx5_core_qp msq; + u32 tisn; + u32 tirn; + struct mlx5_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u32 doorbell_qpn; + u8 sq_signal_bits; + u8 fm_cache; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx5_ib_wq sq; + + struct ib_umem *umem; + int buf_size; + /* Raw Ethernet QP's SQ is allocated seperately + * from the RQ's buffer in user-space. + */ + struct ib_umem *sq_umem; + int sq_buf_size; + u64 sq_buf_addr; + int allow_mp_wqe; + + /* serialize qp state modifications + */ + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 state; + /* Raw Ethernet QP's SQ and RQ states */ + u8 rq_state; + u8 sq_state; + int mlx_type; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf *bf; + int has_rq; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int uuarn; + + int create_type; + u32 pa_lkey; + + /* Store signature errors */ + bool signature_en; + + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; + + struct mlx5_ib_mc_flows_list mc_flows_list; +}; + +struct mlx5_ib_dct { + struct ib_dct ibdct; + struct mlx5_core_dct mdct; +}; + +struct mlx5_ib_cq_buf { + struct mlx5_buf buf; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, + MLX5_IB_QP_CAP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX5_IB_QP_CAP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX5_IB_QP_CAP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, + MLX5_IB_QP_CAP_RX_END_PADDING = 1 << 5, +}; + +struct mlx5_umr_wr { + union { + u64 virt_addr; + u64 offset; + } target; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u64 length; + int access_flags; + u32 mkey; +}; + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; + struct list_head list_send_qp; + struct list_head list_recv_qp; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + +struct mlx5_ib_mr { + struct ib_mr ibmr; + 
struct mlx5_core_mr mmr; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + int umred; + dma_addr_t dma; + int npages; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + struct mlx5_core_sig_ctx *sig; + u32 max_reg_descriptors; + u64 size; + u64 page_count; + struct mlx5_ib_mr **children; + int nchild; +}; + +struct mlx5_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx5_ib_indir_reg_list { + struct ib_indir_reg_list ib_irl; + void *mapped_ilist; + struct mlx5_klm *klms; + dma_addr_t map; +}; + +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + /* control access to UMR QP + */ + struct semaphore sem; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_ib_fmr { + struct ib_fmr ibfmr; + struct mlx5_core_mr mr; + int access_flags; + int state; + /* protect fmr state + */ + spinlock_t lock; + u64 wrid; + struct ib_send_wr wr[2]; + u8 page_shift; + struct ib_fast_reg_page_list page_list; +}; + +struct cache_order { + struct kobject kobj; + int order; + int index; + struct mlx5_ib_dev *dev; +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + u32 order; + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; + struct cache_order co; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + int last_add; + int rel_timeout; + int rel_imm; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; + struct ib_srq *s1; +}; + +struct mlx5_dc_tracer { + struct page *pg; + dma_addr_t dma; + int size; + int order; +}; + +struct mlx5_dc_desc { + dma_addr_t dma; + void *buf; +}; + +enum mlx5_op { + MLX5_WR_OP_MLX = 1, +}; + +struct mlx5_mlx_wr { + u8 sl; + u16 dlid; + int icrc; +}; + +struct mlx5_send_wr { + struct ib_send_wr wr; + union { + struct mlx5_mlx_wr mlx; + } sel; +}; + +struct mlx5_dc_data { + struct ib_mr *mr; + struct ib_qp *dcqp; + struct ib_cq *rcq; + struct ib_cq *scq; + unsigned int rx_npages; + unsigned int tx_npages; + struct mlx5_dc_desc *rxdesc; + struct mlx5_dc_desc *txdesc; + unsigned int max_wqes; + unsigned int cur_send; + unsigned int last_send_completed; + int tx_pending; + struct mlx5_ib_dev *dev; + int port; + int initialized; + struct kobject kobj; + unsigned long connects; + unsigned long cnaks; + unsigned long discards; + struct ib_wc wc_tbl[MLX5_CNAK_RX_POLL_CQ_QUOTA]; +}; + +#define MLX5_IB_FS_LAST_PRIO 7 +#define MLX5_IB_FS_MCAST_PRIO (MLX5_IB_FS_LAST_PRIO + 1) +#if (MLX5_NUM_BYPASS_FTS <= MLX5_IB_FS_MCAST_PRIO || \ + MLX5_NUM_BYPASS_FTS <= MLX5_IB_FS_LAST_PRIO) +#error "num of mlx5_ib flow tables is greater than supported" +#endif +#define MLX5_IB_FS_LEFTOVERS_PRIO (MLX5_IB_FS_MCAST_PRIO + 1) + +#define MLX5_IB_NUM_FS_FT (MLX5_IB_FS_LEFTOVERS_PRIO + 1) + +struct mlx5_ib_fs_mc_flow { + unsigned int refcount; + struct ib_flow *ib_flow; + union ib_gid gid; + struct list_head 
list; +}; + +struct mlx5_ib_fs_prio { + struct mlx5_flow_table *ft; + unsigned int refcount; +}; + +struct mlx5_ib_fs_handler { + struct list_head list; + struct ib_flow ibflow; + unsigned int prio; + struct mlx5_flow_rule *rule; +}; + +struct mlx5_ib_fs { + struct mlx5_ib_fs_prio prios[MLX5_IB_NUM_FS_FT]; + /*Protect flow steering bypass flow tables*/ + struct mutex lock; +}; + +struct mlx5_ib_port_sysfs_group { + struct kobject kobj; + bool enabled; + struct attribute_group counters; +}; + +struct mlx5_ib_port { + struct mlx5_ib_dev *dev; + u8 port_num; /* 0 based */ + u16 q_cnt_id; + struct mlx5_ib_port_sysfs_group group; +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev *mdev; + MLX5_DECLARE_DOORBELL_LOCK(uar_lock); + int num_ports; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + struct mlx5_ib_resources devr; + struct mutex slow_path_mutex; + int enable_atomic_resp; + enum ib_atomic_cap atomic_cap; + struct mlx5_mr_cache cache; + struct kobject mr_cache; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + struct timer_list delay_timer; + int fill_delay; + struct mlx5_dc_tracer dctr; + struct mlx5_dc_data dcd[MLX5_MAX_PORTS]; + struct kobject *dc_kobj; + struct mlx5_ib_fs fs; + /* Array with num_ports elements */ + struct mlx5_ib_port *port; + struct kobject *ports_parent; +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp, mqp); +} + +static inline struct mlx5_ib_qp *sq_to_mibqp(struct mlx5_core_qp *msq) +{ + return container_of(msq, struct mlx5_ib_qp, msq); +} + +static inline struct mlx5_ib_qp *rq_to_mibqp(struct mlx5_core_qp *mrq) +{ + return container_of(mrq, struct mlx5_ib_qp, mrq); +} + +static inline struct mlx5_ib_dct *to_mibdct(struct mlx5_core_dct *mdct) +{ + return container_of(mdct, struct mlx5_ib_dct, mdct); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_dct *to_mdct(struct ib_dct *ibdct) +{ + return container_of(ibdct, struct mlx5_ib_dct, ibdct); +} + +static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) +{ + return container_of(ibwq, struct mlx5_ib_rwq, ibwq); +} + +static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table 
*ib_rwq_ind_tbl) +{ + return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl); +} + +static inline struct mlx5_ib_indir_reg_list * +to_mindir_list(struct ib_indir_reg_list *ib_irl) +{ + return container_of(ib_irl, struct mlx5_ib_indir_reg_list, ib_irl); +} + +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx5_ib_ah, ibah); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, uintptr_t virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah, enum rdma_link_layer ll); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +struct ib_qp *mlx5_ib_exp_create_qp(struct ib_pd *pd, + struct ib_exp_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, + struct ib_cq_attr *cq_attr, + int cq_attr_mask); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int 
entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata, int mr_id); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); + +struct ib_indir_reg_list * +mlx5_ib_alloc_indir_reg_list(struct ib_device *device, + unsigned int max_indir_list_len); +void mlx5_ib_free_indir_reg_list(struct ib_indir_reg_list *indir_list); + +struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr); +int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova); +int mlx5_ib_unmap_fmr(struct list_head *fmr_list); +int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); +int mlx5_query_smp_attr_node_info_mad_ifc(struct ib_device *ibdev, + struct ib_smp *out_mad); +int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, + __be64 *sys_image_guid); +int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, + u16 *max_pkeys); +int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, + u32 *vendor_id); +int mlx5_query_pkey_mad_ifc(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +int mlx5_query_node_desc_mad_ifc(struct mlx5_ib_dev *dev, char *node_desc); +int mlx5_query_node_guid_mad_ifc(struct mlx5_ib_dev *dev, u64 *node_guid); +int mlx5_query_gids_mad_ifc(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int mlx5_query_port_mad_ifc(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int umr); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +struct ib_dct *mlx5_ib_create_dct(struct ib_pd *pd, + struct ib_dct_init_attr *attr, + struct ib_udata *udata); +int mlx5_ib_destroy_dct(struct ib_dct *dct); +int mlx5_ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr); +int mlx5_ib_arm_dct(struct ib_dct *dct, struct ib_udata *udata); +int 
mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); +int mlx5_query_port_roce(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port, int index, + __be16 ah_udp_s_port); +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port, + int index, int *gid_type); +int mlx5_ib_exp_query_mkey(struct ib_mr *mr, u64 mkey_attr_mask, + struct ib_mkey_attr *mkey_attr); +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_wq(struct ib_wq *wq); +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + enum ib_wq_attr_mask attr_mask, struct ib_udata *udata); +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +int mlx5_ib_destroy_flow(struct ib_flow *flow_id); +struct net_device *mlx5_ib_get_netdev(struct ib_device *ib_dev, u8 port); +int modify_gid_roce(struct ib_device *ib_dev, u8 port, unsigned int index, + const union ib_gid *gid, const struct ib_gid_attr *attr); +int query_gid_roce(struct ib_device *ib_dev, u8 port, int index, + union ib_gid *gid, struct ib_gid_attr *attr); +int mlx5_process_mad_mad_ifc(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_wc *in_wc, + struct ib_grh *in_grh, struct ib_mad *in_mad, + struct ib_mad *out_mad); + +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +#endif /* MLX5_IB_H */ Index: sys/dev/mlx5/mlx5_ib/mlx5_mad.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_mad.c @@ -0,0 +1,523 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); +} + +static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. 
+ */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + struct mlx5_vport_counters *vc) +{ + pma_cnt_ext->port_xmit_data = cpu_to_be64((vc->transmitted_ib_unicast.octets + + vc->transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_rcv_data = cpu_to_be64((vc->received_ib_unicast.octets + + vc->received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = cpu_to_be64(vc->transmitted_ib_unicast.packets + + vc->transmitted_ib_multicast.packets); + pma_cnt_ext->port_rcv_packets = cpu_to_be64(vc->received_ib_unicast.packets + + vc->received_ib_multicast.packets); + pma_cnt_ext->port_unicast_xmit_packets = cpu_to_be64(vc->transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = cpu_to_be64(vc->received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = cpu_to_be64(vc->transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = cpu_to_be64(vc->received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + struct mlx5_vport_counters *vc) +{ + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, + (vc->transmitted_ib_unicast.octets + + vc->transmitted_ib_multicast.octets) >> 2); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, + (vc->received_ib_unicast.octets + + vc->received_ib_multicast.octets) >> 2); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, + vc->transmitted_ib_unicast.packets + + vc->transmitted_ib_multicast.packets); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, + vc->received_ib_unicast.packets + + vc->received_ib_multicast.packets); +} + +static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_vport_counters *vc; + int err; + int ext; + + vc = kzalloc(sizeof(*vc), GFP_KERNEL); + if (!vc) + return -ENOMEM; + + ext = in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT; + + err = mlx5_get_vport_counters(dev->mdev, port_num, vc); + if (!err) { + if (ext) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + + pma_cnt_ext_assign(pma_cnt_ext, vc); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + + ASSIGN_16BIT_COUNTER(pma_cnt->port_rcv_errors, + (u16)vc->received_errors.packets); + + pma_cnt_assign(pma_cnt, vc); + } + err = 
IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + } + + kfree(vc); + return err; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + memset(out_mad->data, 0, sizeof(out_mad->data)); + + if (MLX5_CAP_GEN(mdev, vport_counters) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { + /* TBD: read error counters from the PPCNT */ + return process_pma_cmd(ibdev, port_num, in_mad, out_mad); + } else { + return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + in_mad, out_mad); + } +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_smp_attr_node_info_mad_ifc(struct ib_device *ibdev, + struct ib_smp *out_mad) +{ + struct ib_smp *in_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + if (!in_mad) + return -ENOMEM; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, + out_mad); + + kfree(in_mad); + return err; +} + +int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + if (err) + goto out; + + memcpy(sys_image_guid, out_mad->data + 4, 8); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + if (err) + goto out; + + *max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + if (err) + goto out; + + *vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & 0xffff; + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_node_desc_mad_ifc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = 
IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_desc, out_mad->data, 64); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_node_guid_mad_ifc(struct mlx5_ib_dev *dev, u64 *node_guid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_pkey_mad_ifc(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_gids_mad_ifc(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_port_mad_ifc(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int err = -ENOMEM; + + if (port < 1 || port > MLX5_CAP_GEN(mdev, num_ports)) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof(*props)); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + 
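/* Most of the remaining port attributes are decoded from the PortInfo SMP reply at fixed byte offsets. */ +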
props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mdev->port_caps[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (mdev->port_caps[port - 1].ext_port_cap & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} Index: sys/dev/mlx5/mlx5_ib/mlx5_main.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_main.c @@ -0,0 +1,3683 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#undef inode +#include +#include +#include +#include +#include +#include "user.h" +#include "mlx5_ib.h" + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "3.2-rc1" +#define DRIVER_RELDATE "May 2016" + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); +#if (__FreeBSD_version >= 1100000) +MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1); +#endif +MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1); +MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1); +MODULE_VERSION(mlx5ib, 1); + +static int deprecated_prof_sel = 2; +module_param_named(prof_sel, deprecated_prof_sel, int, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); + +enum { + MLX5_STANDARD_ATOMIC_SIZE = 0x8, +}; + +struct workqueue_struct *mlx5_ib_wq; + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; + +static void get_atomic_caps(struct mlx5_ib_dev *dev, + struct ib_device_attr *props, + int exp) +{ + int tmp; + u8 atomic_operations; + u8 atomic_size_qp; + u8 atomic_req_endianess; + + atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev, + atomic_req_8B_endianess_mode) || + !mlx5_host_is_le(); + + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) + && (atomic_size_qp & 8)) { + if (atomic_req_endianess) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + if (exp) + props->atomic_cap = IB_ATOMIC_HCA_REPLY_BE; + else + props->atomic_cap = IB_ATOMIC_NONE; + } + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } + + tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) + &&(atomic_size_qp & 8)) { + if (atomic_req_endianess) + props->masked_atomic_cap = IB_ATOMIC_HCA; + else { + if (exp) + props->masked_atomic_cap = IB_ATOMIC_HCA_REPLY_BE; + else + props->masked_atomic_cap = IB_ATOMIC_NONE; + } + } else { + props->masked_atomic_cap = IB_ATOMIC_NONE; + } +} + +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + switch (MLX5_CAP_GEN(dev->mdev, port_type)) { + case MLX5_CAP_PORT_TYPE_IB: + return IB_LINK_LAYER_INFINIBAND; + case MLX5_CAP_PORT_TYPE_ETH: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} + +static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) +{ + return !dev->mdev->issi; +} + +enum { + MLX5_VPORT_ACCESS_METHOD_MAD, + MLX5_VPORT_ACCESS_METHOD_HCA, + MLX5_VPORT_ACCESS_METHOD_NIC, +}; + +static int mlx5_get_vport_access_method(struct ib_device *ibdev) +{ + if (mlx5_use_mad_ifc(to_mdev(ibdev))) + return MLX5_VPORT_ACCESS_METHOD_MAD; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) + return MLX5_VPORT_ACCESS_METHOD_NIC; + + return MLX5_VPORT_ACCESS_METHOD_HCA; +} + +static int mlx5_query_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_system_image_guid_mad_ifc(ibdev, + sys_image_guid); + 
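+ /* For the HCA and NIC access methods the system image GUID is read from the vport context and converted to big-endian before returning. */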
+ case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + return err; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + return err; + + default: + return -EINVAL; + } +} + +static int mlx5_query_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, + pkey_table_size)); + return 0; + + default: + return -EINVAL; + } +} + +static int mlx5_query_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_core_query_vendor_id(dev->mdev, vendor_id); + + default: + return -EINVAL; + } +} + +static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, + __be64 *node_guid) +{ + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(&dev->ib_dev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_node_guid_mad_ifc(dev, node_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); + if (!err) + *node_guid = cpu_to_be64(tmp); + return err; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + if (!err) + *node_guid = cpu_to_be64(tmp); + return err; + + default: + return -EINVAL; + } +} + +struct mlx5_reg_node_desc { + u8 desc[64]; +}; + +static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct mlx5_reg_node_desc in; + + if (mlx5_use_mad_ifc(dev)) + return mlx5_query_node_desc_mad_ifc(dev, node_desc); + + memset(&in, 0, sizeof(in)); + + return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, + sizeof(struct mlx5_reg_node_desc), + MLX5_REG_NODE_DESC, 0, 0); +} + +static int query_device_exp(struct ib_device *ibdev, + struct ib_device_attr *props, + int exp) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + int max_sq_desc; + int max_rq_sg; + int max_sq_sg; + int err; + + + memset(props, 0, sizeof(*props)); + + err = mlx5_query_system_image_guid(ibdev, + &props->sys_image_guid); + if (err) + return err; + + err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); + if (err) + return err; + + err = mlx5_query_vendor_id(ibdev, &props->vendor_id); + if (err) + return err; + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + ((u64)fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (MLX5_CAP_GEN(mdev, pkv)) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (MLX5_CAP_GEN(mdev, qkv)) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (MLX5_CAP_GEN(mdev, apm)) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if (MLX5_CAP_GEN(mdev, xrc)) + props->device_cap_flags |= IB_DEVICE_XRC; + 
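/* Memory management extensions and indirect registration are always advertised. */ +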
props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + props->device_cap_flags |= IB_DEVICE_INDIR_REGISTRATION; + if (MLX5_CAP_GEN(mdev, cq_oi) && + MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + if (MLX5_CAP_GEN(mdev, sho)) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (MLX5_CAP_GEN(mdev, drain_sigerr)) + props->device_cap_flags |= IB_DEVICE_SIGNATURE_RESP_PIPE; + + if (MLX5_CAP_GEN(mdev, block_lb_mc)) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + props->vendor_part_id = mdev->pdev->device; + props->hw_ver = mdev->pdev->revision; + + props->max_mr_size = ~0ull; + props->page_size_cap = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) -1); + props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); + props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); + max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); + max_sq_sg = (max_sq_desc - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); + props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); + props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; + props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); + props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); + props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); + props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); + props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; + props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = (unsigned int)-1; + props->max_indir_reg_mr_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); + get_atomic_caps(dev, props, exp); + props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); + props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + props->max_ah = INT_MAX; + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0xFFFFFFFFFFFFFFFFULL; + props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK | + IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; + + return 0; +} + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + return query_device_exp(ibdev, props, 0); +} + +enum mlx5_ib_width { + MLX5_IB_WIDTH_1X = 1 << 0, + MLX5_IB_WIDTH_2X = 1 << 1, + MLX5_IB_WIDTH_4X = 1 << 2, + MLX5_IB_WIDTH_8X = 1 << 3, + MLX5_IB_WIDTH_12X = 1 << 4 +}; + +static int translate_active_width(struct ib_device *ibdev, u8 active_width, + u8 *ib_width) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int err = 0; + + if (active_width & MLX5_IB_WIDTH_1X) { + *ib_width = IB_WIDTH_1X; + } else if (active_width & MLX5_IB_WIDTH_2X) { + mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n", + (int)active_width); + err = -EINVAL; + } else if (active_width & MLX5_IB_WIDTH_4X) { + 
*ib_width = IB_WIDTH_4X; + } else if (active_width & MLX5_IB_WIDTH_8X) { + *ib_width = IB_WIDTH_8X; + } else if (active_width & MLX5_IB_WIDTH_12X) { + *ib_width = IB_WIDTH_12X; + } else { + mlx5_ib_dbg(dev, "Invalid active_width %d\n", + (int)active_width); + err = -EINVAL; + } + + return err; +} + +/* + * TODO: Move to IB core + */ +enum ib_max_vl_num { + __IB_MAX_VL_0 = 1, + __IB_MAX_VL_0_1 = 2, + __IB_MAX_VL_0_3 = 3, + __IB_MAX_VL_0_7 = 4, + __IB_MAX_VL_0_14 = 5, +}; + +enum mlx5_vl_hw_cap { + MLX5_VL_HW_0 = 1, + MLX5_VL_HW_0_1 = 2, + MLX5_VL_HW_0_2 = 3, + MLX5_VL_HW_0_3 = 4, + MLX5_VL_HW_0_4 = 5, + MLX5_VL_HW_0_5 = 6, + MLX5_VL_HW_0_6 = 7, + MLX5_VL_HW_0_7 = 8, + MLX5_VL_HW_0_14 = 15 +}; + +static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, + u8 *max_vl_num) +{ + switch (vl_hw_cap) { + case MLX5_VL_HW_0: + *max_vl_num = __IB_MAX_VL_0; + break; + case MLX5_VL_HW_0_1: + *max_vl_num = __IB_MAX_VL_0_1; + break; + case MLX5_VL_HW_0_3: + *max_vl_num = __IB_MAX_VL_0_3; + break; + case MLX5_VL_HW_0_7: + *max_vl_num = __IB_MAX_VL_0_7; + break; + case MLX5_VL_HW_0_14: + *max_vl_num = __IB_MAX_VL_0_14; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + u32 *rep; + int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); + struct mlx5_ptys_reg *ptys; + struct mlx5_pmtu_reg *pmtu; + struct mlx5_pvlc_reg pvlc; + void *ctx; + int err; + + rep = mlx5_vzalloc(outlen); + ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); + pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL); + if (!rep || !ptys || !pmtu) { + err = -ENOMEM; + goto out; + } + + memset(props, 0, sizeof(*props)); + + /* what if I am pf with dual port */ + err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen); + if (err) + goto out; + + ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context); + + props->lid = MLX5_GET(hca_vport_context, ctx, lid); + props->lmc = MLX5_GET(hca_vport_context, ctx, lmc); + props->sm_lid = MLX5_GET(hca_vport_context, ctx, sm_lid); + props->sm_sl = MLX5_GET(hca_vport_context, ctx, sm_sl); + props->state = MLX5_GET(hca_vport_context, ctx, vport_state); + props->phys_state = MLX5_GET(hca_vport_context, ctx, + port_physical_state); + props->port_cap_flags = MLX5_GET(hca_vport_context, ctx, cap_mask1); + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); + props->bad_pkey_cntr = MLX5_GET(hca_vport_context, ctx, + pkey_violation_counter); + props->qkey_viol_cntr = MLX5_GET(hca_vport_context, ctx, + qkey_violation_counter); + props->subnet_timeout = MLX5_GET(hca_vport_context, ctx, + subnet_timeout); + props->init_type_reply = MLX5_GET(hca_vport_context, ctx, + init_type_reply); + + ptys->proto_mask |= MLX5_PTYS_IB; + ptys->local_port = port; + err = mlx5_core_access_ptys(mdev, ptys, 0); + if (err) + goto out; + + err = translate_active_width(ibdev, ptys->ib_link_width_oper, + &props->active_width); + if (err) + goto out; + + props->active_speed = (u8)ptys->ib_proto_oper; + + pmtu->local_port = port; + err = mlx5_core_access_pmtu(mdev, pmtu, 0); + if (err) + goto out; + + props->max_mtu = pmtu->max_mtu; + props->active_mtu = pmtu->oper_mtu; + + memset(&pvlc, 0, sizeof(pvlc)); + pvlc.local_port = port; + err = 
mlx5_core_access_pvlc(mdev, &pvlc, 0); + if (err) + goto out; + + err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap, + &props->max_vl_num); +out: + kvfree(rep); + kfree(ptys); + kfree(pmtu); + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_port_mad_ifc(ibdev, port, props); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + return mlx5_query_port_ib(ibdev, port, props); + + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_port_roce(ibdev, port, props); + + default: + return -EINVAL; + } +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_gids_mad_ifc(ibdev, port, index, gid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid); + + case MLX5_VPORT_ACCESS_METHOD_NIC: + return -ENOSYS; + + default: + return -EINVAL; + } +} + +static int mlx5_ib_modify_gid(struct ib_device *ibdev, u8 port, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, + __always_unused void **context) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port); + + if (ll != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + return modify_gid_roce(ibdev, port, index, gid, attr); +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, + pkey); + + default: + return -EINVAL; + } +} + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, 64); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, 64); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) == + IB_LINK_LAYER_ETHERNET); + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not valid. 
+ */ + if (is_eth) + return 0; + + mutex_lock(&dev->cap_mask_mutex); + + err = mlx5_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +enum mlx5_cap_flags { + MLX5_CAP_COMPACT_AV = 1 << 0, +}; + +static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev) +{ + *flags |= MLX5_CAP_GEN(dev, compact_address_vector) ? + MLX5_CAP_COMPACT_AV : 0; +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req; + struct mlx5_exp_ib_alloc_ucontext_resp resp; + struct mlx5_ib_ucontext *context; + struct mlx5_uuar_info *uuari; + struct mlx5_uar *uars; + int gross_uuars; + int num_uars; + int ver; + int uuarn; + int err; + int i; + size_t reqlen; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else { + mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", reqlen); + return ERR_PTR(-EINVAL); + } + + err = ib_copy_from_udata(&req, udata, reqlen); + if (err) { + mlx5_ib_err(dev, "copy failed\n"); + return ERR_PTR(err); + } + + if (req.reserved) { + mlx5_ib_err(dev, "request corrupted\n"); + return ERR_PTR(-EINVAL); + } + + if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) { + mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars); + return ERR_PTR(-ENOMEM); + } + + req.total_num_uuars = ALIGN(req.total_num_uuars, + MLX5_NON_FP_BF_REGS_PER_PAGE); + if (req.num_low_latency_uuars > req.total_num_uuars - 1) { + mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n", + req.total_num_uuars, req.total_num_uuars); + return ERR_PTR(-EINVAL); + } + + num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; + gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; + resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); + if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) + resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); + resp.cache_line_size = L1_CACHE_BYTES; + resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); + resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); + resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + set_mlx5_flags(&resp.flags, dev->mdev); + + if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen) + resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc); + + if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen) + resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + resp.exp_data.comp_mask = MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_VERSION | + MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_COMP_MAX_NUM; + resp.exp_data.cqe_version = MLX5_CAP_GEN(dev->mdev, cqe_version); + resp.exp_data.cqe_comp_max_num = MLX5_CAP_GEN(dev->mdev, + cqe_compression_max_num); + resp.exp_data.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; + 
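+ /* When RoCE is supported, report the UDP source port range used for routable RoCE to user space. */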
+ if (MLX5_CAP_GEN(dev->mdev, roce)) { + resp.exp_data.comp_mask |= MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MIN | + MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MAX; + resp.exp_data.rroce_udp_sport_min = MLX5_CAP_ROCE(dev->mdev, + r_roce_min_src_udp_port); + resp.exp_data.rroce_udp_sport_max = MLX5_CAP_ROCE(dev->mdev, + r_roce_max_src_udp_port); + } + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + uuari = &context->uuari; + mutex_init(&uuari->lock); + uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); + if (!uars) { + err = -ENOMEM; + goto out_ctx; + } + + uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), + sizeof(*uuari->bitmap), + GFP_KERNEL); + if (!uuari->bitmap) { + err = -ENOMEM; + goto out_uar_ctx; + } + /* + * clear all fast path uuars + */ + for (i = 0; i < gross_uuars; i++) { + uuarn = i & 3; + if (uuarn == 2 || uuarn == 3) + set_bit(i, uuari->bitmap); + } + + uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); + if (!uuari->count) { + err = -ENOMEM; + goto out_bitmap; + } + + for (i = 0; i < num_uars; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); + if (err) { + mlx5_ib_err(dev, "uar alloc failed at %d\n", i); + goto out_uars; + } + } + for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) + context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX; + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_uuars = req.total_num_uuars; + resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); + err = ib_copy_to_udata(udata, &resp, + min_t(size_t, udata->outlen, sizeof(resp))); + if (err) + goto out_uars; + + uuari->ver = ver; + uuari->num_low_latency_uuars = req.num_low_latency_uuars; + uuari->uars = uars; + uuari->num_uars = num_uars; + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) { + err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn); + if (err) + goto out_uars; + } + + return &context->ibucontext; + +out_uars: + for (i--; i >= 0; i--) + mlx5_cmd_free_uar(dev->mdev, uars[i].index); + kfree(uuari->count); + +out_bitmap: + kfree(uuari->bitmap); + +out_uar_ctx: + kfree(uars); + +out_ctx: + kfree(context); + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + int i; + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) + mlx5_dealloc_transport_domain(dev->mdev, context->tdn); + + for (i = 0; i < uuari->num_uars; i++) { + if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) + mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); + } + for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) { + if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX) + mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]); + } + + kfree(uuari->count); + kfree(uuari->bitmap); + kfree(uuari->uars); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) +{ + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return 
get_arg(offset); +} + +static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc, + struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) +{ + unsigned long idx; + phys_addr_t pfn; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) { + mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n", + (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start)); + return -EINVAL; + } + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) { + mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n", + idx, uuari->num_uars); + return -EINVAL; + } + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, + (unsigned long long)pfn); + + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) { + mlx5_ib_err(dev, "io remap failed\n"); + return -EAGAIN; + } + + mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? "WC" : "NC", + (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); + + return 0; +} + +static int alloc_and_map_wc(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context, u32 indx, + struct vm_area_struct *vma) +{ + phys_addr_t pfn; + u32 uar_index; + int err; + + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) { + mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n", + (long)PAGE_SIZE, vma->vm_end - vma->vm_start); + return -EINVAL; + } + + if (indx >= MLX5_IB_MAX_CTX_DYNAMIC_UARS) { + mlx5_ib_warn(dev, "wrong offset, idx:%d max:%d\n", + indx, MLX5_IB_MAX_CTX_DYNAMIC_UARS); + return -EINVAL; + } + + /* Fail if uar already allocated */ + if (context->dynamic_wc_uar_index[indx] != MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_warn(dev, "wrong offset, idx %d is busy\n", indx); + return -EINVAL; + } + + err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); + if (err) { + mlx5_ib_warn(dev, "UAR alloc failed\n"); + return err; + } + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + pfn = uar_index2pfn(dev, uar_index); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) { + mlx5_ib_err(dev, "io remap failed\n"); + mlx5_cmd_free_uar(dev->mdev, uar_index); + return -EAGAIN; + } + context->dynamic_wc_uar_index[indx] = uar_index; + + return 0; +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + unsigned long command; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_REGULAR_PAGE: + return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), + true, + uuari, dev, context); + + break; + + case MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA: + case MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA: + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + case MLX5_IB_MMAP_WC_PAGE: + return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), + true, uuari, dev, context); + break; + + case MLX5_IB_MMAP_NC_PAGE: + return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot), + false, uuari, dev, context); + break; + + case MLX5_IB_EXP_ALLOC_N_MMAP_WC: + return alloc_and_map_wc(dev, context, get_index(vma->vm_pgoff), + vma); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) +{ + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct 
mlx5_core_mr mr; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + seg = &in->seg; + seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); + goto err_in; + } + + kfree(in); + *key = mr.key; + + return 0; + +err_in: + kfree(in); + + return err; +} + +static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_core_mr mr; + int err; + + memset(&mr, 0, sizeof(mr)); + mr.key = key; + err = mlx5_core_destroy_mkey(dev->mdev, &mr); + if (err) + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + mlx5_ib_warn(dev, "pd alloc failed\n"); + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_ib_err(dev, "copy failed\n"); + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } else { + err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); + if (err) { + mlx5_ib_err(dev, "alloc mkey failed\n"); + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(err); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + if (!pd->uobject) + free_pa_mkey(mdev, mpd->pa_lkey); + + mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +static struct mlx5_ib_fs_mc_flow *get_mc_flow(struct mlx5_ib_qp *mqp, + union ib_gid *gid) +{ + struct mlx5_ib_fs_mc_flow *iter; + + list_for_each_entry(iter, &mqp->mc_flows_list.flows_list, list) { + if (!memcmp(iter->gid.raw, gid->raw, 16)) + return iter; + } + + return NULL; +} + +static int attach_mcg_fs(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, + union ib_gid *gid) +{ + struct ib_flow_attr *flow_attr; + struct ib_flow_spec_eth *eth_flow; + unsigned int size = sizeof(*flow_attr) + sizeof(*eth_flow); + struct ib_flow *ib_flow; + struct mlx5_ib_qp *mqp = to_mqp(ibqp); + struct mlx5_ib_fs_mc_flow *mc_flow = NULL; + int err = 0; + static const char mac_mask[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + + mutex_lock(&mqp->mc_flows_list.lock); + mc_flow = get_mc_flow(mqp, gid); + if (mc_flow) { + mc_flow->refcount++; + goto unlock; + } + + flow_attr = kzalloc(size, GFP_KERNEL); + if (!flow_attr) { + err = -ENOMEM; + goto unlock; + } + + flow_attr->size = size; + flow_attr->priority = 0; + flow_attr->num_of_specs = 1; + flow_attr->port = 1; + flow_attr->type = IB_FLOW_ATTR_NORMAL; + + eth_flow = (void *)(flow_attr + 1); + eth_flow->type = IB_FLOW_SPEC_ETH; + eth_flow->size = sizeof(*eth_flow); + memcpy(eth_flow->mask.dst_mac, mac_mask, ETH_ALEN); + memcpy(eth_flow->val.dst_mac, &gid->raw[10], ETH_ALEN); + mc_flow = kzalloc(sizeof(*mc_flow), GFP_KERNEL); + if (!mc_flow) { + err = -ENOMEM; + goto free; + } + + ib_flow = ib_create_flow(ibqp, + 
flow_attr, + IB_FLOW_DOMAIN_USER); + if (IS_ERR(ib_flow)) { + err = PTR_ERR(ib_flow); + goto free; + } + + mc_flow->ib_flow = ib_flow; + mc_flow->refcount = 1; + memcpy(&mc_flow->gid, gid, sizeof(*gid)); + list_add_tail(&mc_flow->list, &mqp->mc_flows_list.flows_list); + + mutex_unlock(&mqp->mc_flows_list.lock); + kfree(flow_attr); + return 0; +free: + kfree(flow_attr); + kfree(mc_flow); +unlock: + mutex_unlock(&mqp->mc_flows_list.lock); + return err; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + err = attach_mcg_fs(dev, ibqp, gid); + else + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int detach_mcg_fs(struct ib_qp *ibqp, union ib_gid *gid) +{ + struct mlx5_ib_qp *mqp = to_mqp(ibqp); + struct mlx5_ib_fs_mc_flow *mc_flow; + int err = 0; + + mutex_lock(&mqp->mc_flows_list.lock); + mc_flow = get_mc_flow(mqp, gid); + if (!mc_flow) { + err = -EINVAL; + goto unlock; + } + if (!--mc_flow->refcount) + err = ib_destroy_flow(mc_flow->ib_flow); + list_del(&mc_flow->list); + kfree(mc_flow); +unlock: + mutex_unlock(&mqp->mc_flows_list.lock); + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + err = detach_mcg_fs(ibqp, gid); + else + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static void put_ft(struct mlx5_ib_dev *dev, + struct mlx5_ib_fs_prio *prio, bool ft_added) +{ + prio->refcount -= !!ft_added; + if (!prio->refcount) { + mlx5_destroy_flow_table(prio->ft); + prio->ft = NULL; + } +} + +int mlx5_ib_destroy_flow(struct ib_flow *flow_id) +{ + struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); + struct mlx5_ib_fs_handler *handler = container_of(flow_id, + struct mlx5_ib_fs_handler, + ibflow); + struct mlx5_ib_fs_handler *iter, *tmp; + + mutex_lock(&dev->fs.lock); + + mlx5_del_flow_rule(handler->rule); + + list_for_each_entry_safe(iter, tmp, &handler->list, list) { + mlx5_del_flow_rule(iter->rule); + list_del(&iter->list); + kfree(iter); + } + + put_ft(dev, &dev->fs.prios[handler->prio], true); + + mutex_unlock(&dev->fs.lock); + + kfree(handler); + + return 0; +} + +static int parse_flow_attr(u32 *match_c, u32 *match_v, + union ib_flow_spec *ib_spec) +{ + void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + if (ib_spec->size != sizeof(ib_spec->eth)) + return -EINVAL; + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dmac_47_16), + ib_spec->eth.mask.dst_mac, ETH_ALEN); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dmac_47_16), + ib_spec->eth.val.dst_mac, ETH_ALEN); + + if (ib_spec->eth.mask.vlan_tag) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + cvlan_tag, 1); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_vid, ntohs(ib_spec->eth.val.vlan_tag)); 
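+			/*
+			 * 802.1Q TCI layout: VID in bits 0-11, CFI/DEI in
+			 * bit 12, PCP in bits 13-15.  MLX5_SET() masks each
+			 * value to its destination field width, so the
+			 * shifted TCI below lands only in the cfi and prio
+			 * fields.
+			 */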
+ + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_cfi, + ntohs(ib_spec->eth.mask.vlan_tag) >> 12); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_cfi, + ntohs(ib_spec->eth.val.vlan_tag) >> 12); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_prio, + ntohs(ib_spec->eth.mask.vlan_tag) >> 13); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_prio, + ntohs(ib_spec->eth.val.vlan_tag) >> 13); + } + break; + case IB_FLOW_SPEC_IPV4: + if (ib_spec->size != sizeof(ib_spec->ipv4)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ETHERTYPE_IP); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ip[3]), + &ib_spec->ipv4.mask.src_ip, + sizeof(ib_spec->ipv4.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + src_ip[3]), + &ib_spec->ipv4.val.src_ip, + sizeof(ib_spec->ipv4.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ip[3]), + &ib_spec->ipv4.mask.dst_ip, + sizeof(ib_spec->ipv4.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dst_ip[3]), + &ib_spec->ipv4.val.dst_ip, + sizeof(ib_spec->ipv4.val.dst_ip)); + + break; + case IB_FLOW_SPEC_IPV6: + if (ib_spec->size != sizeof(ib_spec->ipv6)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ETHERTYPE_IPV6); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_c, src_ip), + &ib_spec->ipv6.mask.src_ip, + sizeof(ib_spec->ipv6.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_v, src_ip), + &ib_spec->ipv6.val.src_ip, + sizeof(ib_spec->ipv6.val.src_ip)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_c, dst_ip), + &ib_spec->ipv6.mask.dst_ip, + sizeof(ib_spec->ipv6.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_v, dst_ip), + &ib_spec->ipv6.val.dst_ip, + sizeof(ib_spec->ipv6.val.dst_ip)); + + break; + case IB_FLOW_SPEC_TCP: + if (ib_spec->size != sizeof(ib_spec->tcp_udp)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_TCP); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + + case IB_FLOW_SPEC_UDP: + if (ib_spec->size != sizeof(ib_spec->tcp_udp)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_UDP); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static bool flow_is_multicast(struct ib_flow_attr 
*ib_attr) +{ + struct ib_flow_spec_eth *eth_spec; + + if (ib_attr->type != IB_FLOW_ATTR_NORMAL || + ib_attr->size < sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_eth) || + ib_attr->num_of_specs < 1) + return false; + + eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1); + if (eth_spec->type != IB_FLOW_SPEC_ETH || + eth_spec->size != sizeof(*eth_spec)) + return false; + + return is_multicast_ether_addr(eth_spec->mask.dst_mac) && + is_multicast_ether_addr(eth_spec->val.dst_mac); +} + +static struct mlx5_ib_fs_prio *get_ft(struct mlx5_ib_dev *dev, + struct ib_flow_attr *flow_attr) +{ + struct mlx5_flow_namespace *ns = NULL; + unsigned int priority; + char name[FT_NAME_STR_SZ]; + int n_ent, n_grp; + struct mlx5_ib_fs_prio *prio; + struct mlx5_flow_table *ft; + + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (flow_is_multicast(flow_attr)) { + priority = MLX5_IB_FS_MCAST_PRIO; + snprintf(name, sizeof(name), "bypass_mcast"); + } else { + priority = flow_attr->priority; + snprintf(name, sizeof(name), "bypass%u", priority + 1); + } + ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); + n_ent = FS_MAX_ENTRIES; + n_grp = FS_MAX_TYPES; + prio = &dev->fs.prios[priority]; + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_LEFTOVERS); + build_leftovers_ft_param(name, &priority, &n_ent, &n_grp); + prio = &dev->fs.prios[MLX5_IB_FS_LEFTOVERS_PRIO]; + } + + if (!ns) + return ERR_PTR(-ENOTSUPP); + + ft = prio->ft; + if (!ft) { + ft = mlx5_create_auto_grouped_flow_table(ns, priority, name, n_ent, n_grp); + + if (!IS_ERR(ft)) { + prio->refcount = 0; + prio->ft = ft; + } + } + + return IS_ERR(ft) ? (void *)ft : prio; +} + +static struct mlx5_ib_fs_handler *create_user_normal_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_fs_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_fs_handler *handler; + struct mlx5_flow_table *ft = ft_prio->ft; + u8 match_criteria_enable = 0; + u32 *match_c; + u32 *match_v; + unsigned int spec_index; + void *ib_flow = flow_attr + 1; + int err = 0; + + match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !match_c || !match_v) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + err = parse_flow_attr(match_c, match_v, ib_flow); + if (err < 0) + goto free; + + ib_flow += ((union ib_flow_spec *)ib_flow)->size; + } + + match_criteria_enable = (!outer_header_zero(match_c)) << 0; + handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable, + match_c, match_v, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, + dst); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + handler->prio = ft_prio - dev->fs.prios; + + ft_prio->ft = ft; + ft_prio->refcount++; + + kfree(match_c); + kfree(match_v); + + return handler; + +free: + kfree(handler); + kfree(match_c); + kfree(match_v); + + return ERR_PTR(err); +} +static struct mlx5_ib_fs_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_fs_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_fs_handler *handler_mcast; + struct mlx5_ib_fs_handler *handler_ucast = NULL; + struct 
mlx5_flow_table *ft = ft_prio->ft; + u8 match_criteria_enable = 0; + u32 *match_c; + u32 *match_v; + void *outer_headers_c; + void *outer_headers_v; + static const char mcast_mac[ETH_ALEN] = {0x1}; + static const char empty_mac[ETH_ALEN] = {}; + int err = 0; + + match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + handler_mcast = kzalloc(sizeof(*handler_mcast), GFP_KERNEL); + if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) + handler_ucast = kzalloc(sizeof(*handler_ucast), GFP_KERNEL); + + if (!handler_mcast || !match_c || !match_v || + ((flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) && !handler_ucast)) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler_mcast->list); + if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { + INIT_LIST_HEAD(&handler_ucast->list); + list_add(&handler_ucast->list, &handler_mcast->list); + } + + outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, outer_headers); + outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, outer_headers); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dmac_47_16), mcast_mac, ETH_ALEN); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dmac_47_16), mcast_mac, ETH_ALEN); + + match_criteria_enable = (!outer_header_zero(match_c)) << 0; + handler_mcast->rule = mlx5_add_flow_rule(ft, match_criteria_enable, + match_c, match_v, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, + dst); + + if (IS_ERR(handler_mcast->rule)) { + err = PTR_ERR(handler_mcast->rule); + goto free; + } + + handler_mcast->prio = ft_prio - dev->fs.prios; + + if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dmac_47_16), empty_mac, ETH_ALEN); + + match_criteria_enable = (!outer_header_zero(match_c)) << 0; + handler_ucast->rule = mlx5_add_flow_rule(ft, match_criteria_enable, + match_c, match_v, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, + dst); + + if (IS_ERR(handler_ucast->rule)) { + err = PTR_ERR(handler_ucast->rule); + goto destroy_mcast; + } + + handler_ucast->prio = ft_prio - dev->fs.prios; + } + + ft_prio->ft = ft; + ft_prio->refcount++; + + kfree(match_c); + kfree(match_v); + + return handler_mcast; + +destroy_mcast: + mlx5_del_flow_rule(handler_mcast->rule); +free: + kfree(match_c); + kfree(match_v); + kfree(handler_mcast); + kfree(handler_ucast); + return ERR_PTR(err); +} + +static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + int err; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_ib_fs_handler *handler = NULL; + struct mlx5_ib_fs_prio *ft_prio; + struct mlx5_flow_table *ft; + + if (flow_attr->priority > MLX5_IB_FS_LAST_PRIO) { + mlx5_ib_warn(dev, "wrong priority %d\n", flow_attr->priority); + return ERR_PTR(-ENOSPC); + } + + if (domain != IB_FLOW_DOMAIN_USER || + flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || + flow_attr->flags) { + mlx5_ib_warn(dev, "wrong params\n"); + return ERR_PTR(-EINVAL); + } + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + mutex_lock(&dev->fs.lock); + + ft_prio = get_ft(dev, flow_attr); + if (IS_ERR(ft_prio)) { + mlx5_ib_warn(dev, "failed to get priority\n"); + err = PTR_ERR(ft_prio); + goto unlock; + } + + ft = ft_prio->ft; + + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dst->tir_num = to_mqp(qp)->tirn; + + if (flow_attr->type == 
IB_FLOW_ATTR_NORMAL) { + handler = create_user_normal_rule(dev, ft_prio, flow_attr, + dst); + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + handler = create_leftovers_rule(dev, ft_prio, flow_attr, + dst); + } else { + mlx5_ib_warn(dev, "wrong attr type %d\n", flow_attr->type); + err = -EINVAL; + goto destroy_ft; + } + + if (IS_ERR(handler)) { + mlx5_ib_warn(dev, "failed to create rule\n"); + err = PTR_ERR(handler); + handler = NULL; + goto destroy_ft; + } + + mutex_unlock(&dev->fs.lock); + kfree(dst); + + return &handler->ibflow; + +destroy_ft: + put_ft(dev, ft_prio, false); +unlock: + mutex_unlock(&dev->fs.lock); + kfree(dst); + kfree(handler); + return ERR_PTR(err); +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); + if (err) + return err; + + dev->mdev->rev_id = dev->mdev->pdev->revision; + + return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev), + fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_qp *mqp; + struct mlx5_ib_cq *send_mcq, *recv_mcq; + struct mlx5_core_cq *mcq; + struct list_head cq_armed_list; + unsigned long flags_qp; + unsigned long flags_cq; + unsigned long flags; + + mlx5_ib_warn(ibdev, " started\n"); + INIT_LIST_HEAD(&cq_armed_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + 
spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + /*At that point all inflight post send were put to be executed as of we + * lock/unlock above locks Now need to arm all involved CQs. + */ + list_for_each_entry(mcq, &cq_armed_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + mlx5_ib_warn(ibdev, " ended\n"); + return; +} + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; + struct ib_event ibev; + + u8 port = 0; + + switch (event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); + break; + + case MLX5_DEV_EVENT_PORT_UP: + ibev.event = IB_EVENT_PORT_ACTIVE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_DOWN: + case MLX5_DEV_EVENT_PORT_INITIALIZED: + ibev.event = IB_EVENT_PORT_ERR; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + port = (u8)param; + break; + + default: + break; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if ((event != MLX5_DEV_EVENT_SYS_ERROR) && + (port < 1 || port > ibdev->num_ports)) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + return; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + int port; + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) + mlx5_query_ext_port_caps(dev, port); +} + +static void config_atomic_responder(struct mlx5_ib_dev *dev, + struct ib_exp_device_attr *props) +{ + enum ib_atomic_cap cap = props->base.atomic_cap; + + if (cap == IB_ATOMIC_HCA || + cap == IB_ATOMIC_GLOB || + cap == IB_ATOMIC_HCA_REPLY_BE) + dev->enable_atomic_resp = 1; + + dev->atomic_cap = cap; +} + +enum mlx5_addr_align { + MLX5_ADDR_ALIGN_0 = 0, + MLX5_ADDR_ALIGN_64 = 64, + MLX5_ADDR_ALIGN_128 = 128, +}; + +static int mlx5_ib_exp_query_device(struct ib_device *ibdev, 
+ struct ib_exp_device_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int err; + + err = query_device_exp(ibdev, &props->base, 1); + if (err) + return err; + + props->exp_comp_mask = IB_EXP_DEVICE_ATTR_CAP_FLAGS2; + props->device_cap_flags2 = 0; + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD; + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD; + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_MAX_DCT; + if (MLX5_CAP_GEN(dev->mdev, dct)) { + props->device_cap_flags2 |= IB_EXP_DEVICE_DC_TRANSPORT; + props->dc_rd_req = 1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_dc); + props->dc_rd_res = 1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_dc); + props->max_dct = props->base.max_qp; + } else { + props->dc_rd_req = 0; + props->dc_rd_res = 0; + props->max_dct = 0; + } + + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; + if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe)) + props->inline_recv_sz = MLX5_MAX_INLINE_RECEIVE_SIZE; + else + props->inline_recv_sz = 0; + + props->device_cap_flags2 |= IB_EXP_DEVICE_NOP; + + props->device_cap_flags2 |= IB_EXP_DEVICE_UMR; + props->umr_caps.max_reg_descriptors = 1 << MLX5_CAP_GEN(dev->mdev, log_max_klm_list_size); + props->umr_caps.max_send_wqe_inline_klms = 20; + props->umr_caps.max_umr_recursion_depth = MLX5_CAP_GEN(dev->mdev, max_indirection); + props->umr_caps.max_umr_stride_dimenson = 1; + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_UMR; + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN; + props->max_ctx_res_domain = MLX5_IB_MAX_CTX_DYNAMIC_UARS * MLX5_NON_FP_BF_REGS_PER_PAGE; + + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_RX_HASH; + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ; + if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) { + props->rx_hash_caps.max_rwq_indirection_tables = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); + props->rx_hash_caps.max_rwq_indirection_table_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); + props->rx_hash_caps.supported_hash_functions = IB_EX_RX_HASH_FUNC_TOEPLITZ; + props->rx_hash_caps.supported_packet_fields = IB_RX_HASH_SRC_IPV4 | + IB_RX_HASH_DST_IPV4 | + IB_RX_HASH_SRC_IPV6 | + IB_RX_HASH_DST_IPV6 | + IB_RX_HASH_SRC_PORT_TCP | + IB_RX_HASH_DST_PORT_TCP | + IB_RX_HASH_SRC_PORT_UDP | + IB_RX_HASH_DST_PORT_UDP; + props->rx_hash_caps.supported_qps = IB_EXP_QPT_RAW_PACKET; + props->max_wq_type_rq = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); + } else { + memset(&props->rx_hash_caps, 0, sizeof(props->rx_hash_caps)); + props->max_wq_type_rq = 0; + } + + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_MAX_DEVICE_CTX; + /*mlx5_core uses NUM_DRIVER_UARS uar pages*/ + /*For simplicity, assume one to one releation ship between uar pages and context*/ + props->max_device_ctx = + (1 << (MLX5_CAP_GEN(dev->mdev, uar_sz) + 20 - PAGE_SHIFT)) + / (MLX5_DEF_TOT_UUARS / MLX5_NUM_UUARS_PER_PAGE) + - NUM_DRIVER_UARS; + + props->rx_pad_end_addr_align = MLX5_ADDR_ALIGN_0; + if (MLX5_CAP_GEN(dev->mdev, pad_cap)) { + if (MLX5_CAP_GEN(dev->mdev, cache_line_128byte) && + (cache_line_size() == 128)) + props->rx_pad_end_addr_align = MLX5_ADDR_ALIGN_128; + else + props->rx_pad_end_addr_align = MLX5_ADDR_ALIGN_64; + } + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN; + + return err; +} + +static int get_port_caps(struct mlx5_ib_dev *dev) +{ + struct ib_exp_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + int err = -ENOMEM; + int port; + + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), 
GFP_KERNEL); + if (!dprops) + goto out; + + err = mlx5_ib_exp_query_device(&dev->ib_dev, dprops); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + config_atomic_responder(dev, dprops); + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", + port, err); + break; + } + dev->mdev->port_caps[port - 1].pkey_table_len = dprops->base.max_pkeys; + dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", + dprops->base.max_pkeys, pprops->gid_tbl_len); + } + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + mlx5_ib_destroy_qp(dev->umrc.qp); + ib_destroy_cq(dev->umrc.cq); + ib_dereg_mr(dev->umrc.mr); + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); + ret = PTR_ERR(mr); + goto error_1; + } + + cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128, + 0); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.mr = mr; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(attr); + kfree(init_attr); + + 
return 0; + +error_4: + mlx5_ib_destroy_qp(qp); + +error_3: + ib_destroy_cq(cq); + +error_2: + ib_dereg_mr(mr); + +error_1: + ib_dealloc_pd(pd); + +error_0: + kfree(attr); + kfree(init_attr); + return ret; +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + struct ib_cq_init_attr cq_attr; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = &dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + memset(&cq_attr, 0, sizeof(cq_attr)); + cq_attr.cqe = 1; + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.xrc.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.xrc.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_BASIC; + devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s1)) { + ret = PTR_ERR(devr->s1); + goto error5; + } + devr->s1->device = &dev->ib_dev; + devr->s1->pd = devr->p0; + devr->s1->uobject = NULL; + devr->s1->event_handler = NULL; + devr->s1->srq_context = NULL; + devr->s1->srq_type = IB_SRQT_BASIC; + devr->s1->ext.xrc.cq = devr->c0; + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s1->usecnt, 0); + + return 0; + +error5: + mlx5_ib_destroy_srq(devr->s0); +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + mlx5_ib_destroy_srq(devr->s1); + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); +} + +static void enable_dc_tracer(struct mlx5_ib_dev *dev) +{ 
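+	/*
+	 * Reserve a 4KB trace region per port (rounded up to whole
+	 * pages), fill it with 0xff, DMA-map it for device writes and
+	 * hand the bus address to firmware so DC CNAK events can be
+	 * traced into it.
+	 */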
+ struct device *device = dev->ib_dev.dma_device; + struct mlx5_dc_tracer *dct = &dev->dctr; + int order; + void *tmp; + int size; + int err; + + size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096; + if (size <= PAGE_SIZE) + order = 0; + else + order = 1; + + dct->pg = alloc_pages(GFP_KERNEL, order); + if (!dct->pg) { + mlx5_ib_err(dev, "failed to allocate %d pages\n", order); + return; + } + + tmp = page_address(dct->pg); + memset(tmp, 0xff, size); + + dct->size = size; + dct->order = order; + dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE); + if (dma_mapping_error(device, dct->dma)) { + mlx5_ib_err(dev, "dma mapping error\n"); + goto map_err; + } + + err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma); + if (err) { + mlx5_ib_warn(dev, "failed to enable DC tracer\n"); + goto cmd_err; + } + + return; + +cmd_err: + dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE); +map_err: + __free_pages(dct->pg, dct->order); + dct->pg = NULL; +} + +static void disable_dc_tracer(struct mlx5_ib_dev *dev) +{ + struct device *device = dev->ib_dev.dma_device; + struct mlx5_dc_tracer *dct = &dev->dctr; + int err; + + if (!dct->pg) + return; + + err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma); + if (err) { + mlx5_ib_warn(dev, "failed to disable DC tracer\n"); + return; + } + + dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE); + __free_pages(dct->pg, dct->order); + dct->pg = NULL; +} + +enum { + MLX5_DC_CNAK_SIZE = 128, + MLX5_NUM_BUF_IN_PAGE = PAGE_SIZE / MLX5_DC_CNAK_SIZE, + MLX5_CNAK_TX_CQ_SIGNAL_FACTOR = 128, + MLX5_DC_CNAK_SL = 0, + MLX5_DC_CNAK_VL = 0, +}; + +static void dump_buf(void *buf, int size) +{ + __be32 *p = buf; + int offset; + int i; + + for (i = 0, offset = 0; i < size; i += 16) { + printf("mlx5_ib: INFO: ""%03x: %08x %08x %08x %08x\n", offset, be32_to_cpu(p[0]), be32_to_cpu(p[1]), be32_to_cpu(p[2]), be32_to_cpu(p[3])); + p += 4; + offset += 16; + } + printf("mlx5_ib: INFO: ""\n"); +} + +enum { + CNAK_LENGTH_WITHOUT_GRH = 32, + CNAK_LENGTH_WITH_GRH = 72, +}; + +static struct mlx5_dc_desc *get_desc_from_index(struct mlx5_dc_desc *desc, u64 index, unsigned *offset) +{ + struct mlx5_dc_desc *d; + + int i; + int j; + + i = index / MLX5_NUM_BUF_IN_PAGE; + j = index % MLX5_NUM_BUF_IN_PAGE; + d = desc + i; + *offset = j * MLX5_DC_CNAK_SIZE; + return d; +} + +static void build_cnak_msg(void *rbuf, void *sbuf, u32 *length, u16 *dlid) +{ + void *rdceth, *sdceth; + void *rlrh, *slrh; + void *rgrh, *sgrh; + void *rbth, *sbth; + int is_global; + void *saeth; + + memset(sbuf, 0, MLX5_DC_CNAK_SIZE); + rlrh = rbuf; + is_global = MLX5_GET(lrh, rlrh, lnh) == 0x3; + rgrh = is_global ? rlrh + MLX5_ST_SZ_BYTES(lrh) : NULL; + rbth = rgrh ? rgrh + MLX5_ST_SZ_BYTES(grh) : rlrh + MLX5_ST_SZ_BYTES(lrh); + rdceth = rbth + MLX5_ST_SZ_BYTES(bth); + + slrh = sbuf; + sgrh = is_global ? slrh + MLX5_ST_SZ_BYTES(lrh) : NULL; + sbth = sgrh ? sgrh + MLX5_ST_SZ_BYTES(grh) : slrh + MLX5_ST_SZ_BYTES(lrh); + sdceth = sbth + MLX5_ST_SZ_BYTES(bth); + saeth = sdceth + MLX5_ST_SZ_BYTES(dceth); + + *dlid = MLX5_GET(lrh, rlrh, slid); + MLX5_SET(lrh, slrh, vl, MLX5_DC_CNAK_VL); + MLX5_SET(lrh, slrh, lver, MLX5_GET(lrh, rlrh, lver)); + MLX5_SET(lrh, slrh, sl, MLX5_DC_CNAK_SL); + MLX5_SET(lrh, slrh, lnh, MLX5_GET(lrh, rlrh, lnh)); + MLX5_SET(lrh, slrh, dlid, MLX5_GET(lrh, rlrh, slid)); + MLX5_SET(lrh, slrh, pkt_len, 0x9 + ((is_global ? 
MLX5_ST_SZ_BYTES(grh) : 0) >> 2)); + MLX5_SET(lrh, slrh, slid, MLX5_GET(lrh, rlrh, dlid)); + + if (is_global) { + void *rdgid, *rsgid; + void *ssgid, *sdgid; + + MLX5_SET(grh, sgrh, ip_version, MLX5_GET(grh, rgrh, ip_version)); + MLX5_SET(grh, sgrh, traffic_class, MLX5_GET(grh, rgrh, traffic_class)); + MLX5_SET(grh, sgrh, flow_label, MLX5_GET(grh, rgrh, flow_label)); + MLX5_SET(grh, sgrh, payload_length, 0x1c); + MLX5_SET(grh, sgrh, next_header, 0x1b); + MLX5_SET(grh, sgrh, hop_limit, MLX5_GET(grh, rgrh, hop_limit)); + + rdgid = MLX5_ADDR_OF(grh, rgrh, dgid); + rsgid = MLX5_ADDR_OF(grh, rgrh, sgid); + ssgid = MLX5_ADDR_OF(grh, sgrh, sgid); + sdgid = MLX5_ADDR_OF(grh, sgrh, dgid); + memcpy(ssgid, rdgid, 16); + memcpy(sdgid, rsgid, 16); + *length = CNAK_LENGTH_WITH_GRH; + } else { + *length = CNAK_LENGTH_WITHOUT_GRH; + } + + MLX5_SET(bth, sbth, opcode, 0x51); + MLX5_SET(bth, sbth, migreq, 0x1); + MLX5_SET(bth, sbth, p_key, MLX5_GET(bth, rbth, p_key)); + MLX5_SET(bth, sbth, dest_qp, MLX5_GET(dceth, rdceth, dci_dct)); + MLX5_SET(bth, sbth, psn, MLX5_GET(bth, rbth, psn)); + + MLX5_SET(dceth, sdceth, dci_dct, MLX5_GET(bth, rbth, dest_qp)); + + MLX5_SET(aeth, saeth, syndrome, 0x64); + + if (0) { + printf("mlx5_ib: INFO: ""===dump packet ====\n"); + dump_buf(sbuf, *length); + } +} + +static int reduce_tx_pending(struct mlx5_dc_data *dcd, int num) +{ + struct mlx5_ib_dev *dev = dcd->dev; + struct ib_cq *cq = dcd->scq; + unsigned int send_completed; + unsigned int polled; + struct ib_wc wc; + int n; + + while (num > 0) { + n = ib_poll_cq(cq, 1, &wc); + if (unlikely(n < 0)) { + mlx5_ib_warn(dev, "error polling cnak send cq\n"); + return n; + } + if (unlikely(!n)) + return -EAGAIN; + + if (unlikely(wc.status != IB_WC_SUCCESS)) { + mlx5_ib_warn(dev, "cnak send completed with error, status %d vendor_err %d\n", + wc.status, wc.vendor_err); + dcd->last_send_completed++; + dcd->tx_pending--; + num--; + } else { + send_completed = wc.wr_id; + polled = send_completed - dcd->last_send_completed; + dcd->tx_pending = (unsigned int)(dcd->cur_send - send_completed); + num -= polled; + dcd->cnaks += polled; + dcd->last_send_completed = send_completed; + } + } + + return 0; +} + +static int send_cnak(struct mlx5_dc_data *dcd, struct mlx5_send_wr *mlx_wr, + u64 rcv_buff_id) +{ + struct ib_send_wr *wr = &mlx_wr->wr; + struct mlx5_ib_dev *dev = dcd->dev; + struct ib_send_wr *bad_wr; + struct mlx5_dc_desc *rxd; + struct mlx5_dc_desc *txd; + unsigned int offset; + unsigned int cur; + __be32 *sbuf; + void *rbuf; + int err; + + if (unlikely(dcd->tx_pending > dcd->max_wqes)) { + mlx5_ib_warn(dev, "SW error in cnak send: tx_pending(%d) > max_wqes(%d)\n", + dcd->tx_pending, dcd->max_wqes); + return -EFAULT; + } + + if (unlikely(dcd->tx_pending == dcd->max_wqes)) { + err = reduce_tx_pending(dcd, 1); + if (err) + return err; + if (dcd->tx_pending == dcd->max_wqes) + return -EAGAIN; + } + + cur = dcd->cur_send; + txd = get_desc_from_index(dcd->txdesc, cur % dcd->max_wqes, &offset); + sbuf = txd->buf + offset; + + wr->sg_list[0].addr = txd->dma + offset; + wr->sg_list[0].lkey = dcd->mr->lkey; + wr->opcode = IB_WR_SEND; + wr->num_sge = 1; + wr->wr_id = cur; + if (cur % MLX5_CNAK_TX_CQ_SIGNAL_FACTOR) + wr->send_flags &= ~IB_SEND_SIGNALED; + else + wr->send_flags |= IB_SEND_SIGNALED; + + rxd = get_desc_from_index(dcd->rxdesc, rcv_buff_id, &offset); + rbuf = rxd->buf + offset; + build_cnak_msg(rbuf, sbuf, &wr->sg_list[0].length, &mlx_wr->sel.mlx.dlid); + + mlx_wr->sel.mlx.sl = MLX5_DC_CNAK_SL; + mlx_wr->sel.mlx.icrc = 1; + + err = 
ib_post_send(dcd->dcqp, wr, &bad_wr); + if (likely(!err)) { + dcd->tx_pending++; + dcd->cur_send++; + } + + return err; +} + +static int mlx5_post_one_rxdc(struct mlx5_dc_data *dcd, int index) +{ + struct ib_recv_wr *bad_wr; + struct ib_recv_wr wr; + struct ib_sge sge; + u64 addr; + int err; + int i; + int j; + + i = index / (PAGE_SIZE / MLX5_DC_CNAK_SIZE); + j = index % (PAGE_SIZE / MLX5_DC_CNAK_SIZE); + addr = dcd->rxdesc[i].dma + j * MLX5_DC_CNAK_SIZE; + + memset(&wr, 0, sizeof(wr)); + wr.num_sge = 1; + sge.addr = addr; + sge.length = MLX5_DC_CNAK_SIZE; + sge.lkey = dcd->mr->lkey; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.wr_id = index; + err = ib_post_recv(dcd->dcqp, &wr, &bad_wr); + if (unlikely(err)) + mlx5_ib_warn(dcd->dev, "failed to post dc rx buf at index %d\n", index); + + return err; +} + +static void dc_cnack_rcv_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_dc_data *dcd = cq_context; + struct mlx5_ib_dev *dev = dcd->dev; + struct mlx5_send_wr mlx_wr; + struct ib_send_wr *wr = &mlx_wr.wr; + struct ib_wc *wc = dcd->wc_tbl; + struct ib_sge sge; + int err; + int n; + int i; + + memset(&mlx_wr, 0, sizeof(mlx_wr)); + wr->sg_list = &sge; + + n = ib_poll_cq(cq, MLX5_CNAK_RX_POLL_CQ_QUOTA, wc); + if (unlikely(n < 0)) { + /* mlx5 never returns negative values but leave a message just in case */ + mlx5_ib_warn(dev, "failed to poll cq (%d), aborting\n", n); + return; + } + if (likely(n > 0)) { + for (i = 0; i < n; i++) { + if (unlikely(wc[i].status != IB_WC_SUCCESS)) { + if (dev->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + mlx5_ib_warn(dev, "DC cnak: completed with error, status = %d vendor_err = %d\n", + wc[i].status, wc[i].vendor_err); + } else { + dcd->connects++; + if (unlikely(send_cnak(dcd, &mlx_wr, wc[i].wr_id))) + mlx5_ib_warn(dev, "DC cnak: failed to allocate send buf - dropped\n"); + } + + if (unlikely(mlx5_post_one_rxdc(dcd, wc[i].wr_id))) { + dcd->discards++; + mlx5_ib_warn(dev, "DC cnak: repost rx failed, will leak rx queue\n"); + } + } + } + + err = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (unlikely(err)) + mlx5_ib_warn(dev, "DC cnak: failed to re-arm receive cq (%d)\n", err); +} + +static int alloc_dc_buf(struct mlx5_dc_data *dcd, int rx) +{ + struct mlx5_ib_dev *dev = dcd->dev; + struct mlx5_dc_desc **desc; + struct mlx5_dc_desc *d; + struct device *ddev; + int max_wqes; + int err = 0; + int npages; + int totsz; + int i; + + ddev = &dev->mdev->pdev->dev; + max_wqes = dcd->max_wqes; + totsz = max_wqes * MLX5_DC_CNAK_SIZE; + npages = DIV_ROUND_UP(totsz, PAGE_SIZE); + desc = rx ? 
&dcd->rxdesc : &dcd->txdesc; + *desc = kcalloc(npages, sizeof(*dcd->rxdesc), GFP_KERNEL); + if (!*desc) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < npages; i++) { + d = *desc + i; + d->buf = dma_alloc_coherent(ddev, PAGE_SIZE, &d->dma, GFP_KERNEL); + if (!d->buf) { + mlx5_ib_err(dev, "dma alloc failed at %d\n", i); + goto out_free; + } + } + if (rx) + dcd->rx_npages = npages; + else + dcd->tx_npages = npages; + + return 0; + +out_free: + for (i--; i >= 0; i--) { + d = *desc + i; + dma_free_coherent(ddev, PAGE_SIZE, d->buf, d->dma); + } + kfree(*desc); +out: + return err; +} + +static int alloc_dc_rx_buf(struct mlx5_dc_data *dcd) +{ + return alloc_dc_buf(dcd, 1); +} + +static int alloc_dc_tx_buf(struct mlx5_dc_data *dcd) +{ + return alloc_dc_buf(dcd, 0); +} + +static void free_dc_buf(struct mlx5_dc_data *dcd, int rx) +{ + struct mlx5_ib_dev *dev = dcd->dev; + struct mlx5_dc_desc *desc; + struct mlx5_dc_desc *d; + struct device *ddev; + int npages; + int i; + + ddev = &dev->mdev->pdev->dev; + npages = rx ? dcd->rx_npages : dcd->tx_npages; + desc = rx ? dcd->rxdesc : dcd->txdesc; + for (i = 0; i < npages; i++) { + d = desc + i; + dma_free_coherent(ddev, PAGE_SIZE, d->buf, d->dma); + } + kfree(desc); +} + +static void free_dc_rx_buf(struct mlx5_dc_data *dcd) +{ + free_dc_buf(dcd, 1); +} + +static void free_dc_tx_buf(struct mlx5_dc_data *dcd) +{ + free_dc_buf(dcd, 0); +} + +struct dc_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_dc_data *, struct dc_attribute *, char *buf); + ssize_t (*store)(struct mlx5_dc_data *, struct dc_attribute *, + const char *buf, size_t count); +}; + +#define DC_ATTR(_name, _mode, _show, _store) \ +struct dc_attribute dc_attr_##_name = __ATTR(_name, _mode, _show, _store) + +static ssize_t rx_connect_show(struct mlx5_dc_data *dcd, + struct dc_attribute *unused, + char *buf) +{ + unsigned long num; + + num = dcd->connects; + + return sprintf(buf, "%lu\n", num); +} + +static ssize_t tx_cnak_show(struct mlx5_dc_data *dcd, + struct dc_attribute *unused, + char *buf) +{ + unsigned long num; + + num = dcd->cnaks; + + return sprintf(buf, "%lu\n", num); +} + +static ssize_t tx_discard_show(struct mlx5_dc_data *dcd, + struct dc_attribute *unused, + char *buf) +{ + unsigned long num; + + num = dcd->discards; + + return sprintf(buf, "%lu\n", num); +} + +#define DC_ATTR_RO(_name) \ +struct dc_attribute dc_attr_##_name = __ATTR_RO(_name) + +static DC_ATTR_RO(rx_connect); +static DC_ATTR_RO(tx_cnak); +static DC_ATTR_RO(tx_discard); + +static struct attribute *dc_attrs[] = { + &dc_attr_rx_connect.attr, + &dc_attr_tx_cnak.attr, + &dc_attr_tx_discard.attr, + NULL +}; + +static ssize_t dc_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct dc_attribute *dc_attr = container_of(attr, struct dc_attribute, attr); + struct mlx5_dc_data *d = container_of(kobj, struct mlx5_dc_data, kobj); + + if (!dc_attr->show) + return -EIO; + + return dc_attr->show(d, dc_attr, buf); +} + +static const struct sysfs_ops dc_sysfs_ops = { + .show = dc_attr_show +}; + +static struct kobj_type dc_type = { + .sysfs_ops = &dc_sysfs_ops, + .default_attrs = dc_attrs +}; + +static int init_sysfs(struct mlx5_ib_dev *dev) +{ + struct device *device = &dev->ib_dev.dev; + + dev->dc_kobj = kobject_create_and_add("dct", &device->kobj); + if (!dev->dc_kobj) { + mlx5_ib_err(dev, "failed to register DCT sysfs object\n"); + return -ENOMEM; + } + + return 0; +} + +static void cleanup_sysfs(struct mlx5_ib_dev *dev) +{ + if (dev->dc_kobj) { + kobject_put(dev->dc_kobj); + 
dev->dc_kobj = NULL; + } +} + +static int init_port_sysfs(struct mlx5_dc_data *dcd) +{ + return kobject_init_and_add(&dcd->kobj, &dc_type, dcd->dev->dc_kobj, + "%d", dcd->port); +} + +static void cleanup_port_sysfs(struct mlx5_dc_data *dcd) +{ + kobject_put(&dcd->kobj); +} + +static int init_driver_cnak(struct mlx5_ib_dev *dev, int port) +{ + int ncqe = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + struct mlx5_dc_data *dcd = &dev->dcd[port - 1]; + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_qp_init_attr init_attr; + struct ib_pd *pd = devr->p0; + struct ib_qp_attr attr; + int err; + int i; + + dcd->dev = dev; + dcd->port = port; + dcd->mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(dcd->mr)) { + mlx5_ib_warn(dev, "failed to create dc DMA MR\n"); + err = PTR_ERR(dcd->mr); + goto error1; + } + + dcd->rcq = ib_create_cq(&dev->ib_dev, dc_cnack_rcv_comp_handler, NULL, + dcd, ncqe, 0); + if (IS_ERR(dcd->rcq)) { + err = PTR_ERR(dcd->rcq); + mlx5_ib_warn(dev, "failed to create dc cnack rx cq (%d)\n", err); + goto error2; + } + + err = ib_req_notify_cq(dcd->rcq, IB_CQ_NEXT_COMP); + if (err) { + mlx5_ib_warn(dev, "failed to setup dc cnack rx cq (%d)\n", err); + goto error3; + } + + dcd->scq = ib_create_cq(&dev->ib_dev, NULL, NULL, + dcd, ncqe, 0); + if (IS_ERR(dcd->scq)) { + err = PTR_ERR(dcd->scq); + mlx5_ib_warn(dev, "failed to create dc cnack tx cq (%d)\n", err); + goto error3; + } + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.qp_type = MLX5_IB_QPT_SW_CNAK; + init_attr.cap.max_recv_wr = ncqe; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_wr = ncqe; + init_attr.cap.max_send_sge = 1; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.recv_cq = dcd->rcq; + init_attr.send_cq = dcd->scq; + dcd->dcqp = ib_create_qp(pd, &init_attr); + if (IS_ERR(dcd->dcqp)) { + mlx5_ib_warn(dev, "failed to create qp (%d)\n", err); + err = PTR_ERR(dcd->dcqp); + goto error4; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_INIT; + attr.port_num = port; + err = ib_modify_qp(dcd->dcqp, &attr, + IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT); + if (err) { + mlx5_ib_warn(dev, "failed to modify qp to init\n"); + goto error5; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTR; + attr.path_mtu = IB_MTU_4096; + err = ib_modify_qp(dcd->dcqp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "failed to modify qp to rtr\n"); + goto error5; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(dcd->dcqp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "failed to modify qp to rts\n"); + goto error5; + } + + dcd->max_wqes = ncqe; + err = alloc_dc_rx_buf(dcd); + if (err) { + mlx5_ib_warn(dev, "failed to allocate rx buf\n"); + goto error5; + } + + err = alloc_dc_tx_buf(dcd); + if (err) { + mlx5_ib_warn(dev, "failed to allocate tx buf\n"); + goto error6; + } + + for (i = 0; i < ncqe; i++) { + err = mlx5_post_one_rxdc(dcd, i); + if (err) + goto error7; + } + + err = init_port_sysfs(dcd); + if (err) { + mlx5_ib_warn(dev, "failed to initialize DC cnak sysfs\n"); + goto error7; + } + + dcd->initialized = 1; + return 0; + +error7: + free_dc_tx_buf(dcd); +error6: + free_dc_rx_buf(dcd); +error5: + if (ib_destroy_qp(dcd->dcqp)) + mlx5_ib_warn(dev, "failed to destroy dc qp\n"); +error4: + if (ib_destroy_cq(dcd->scq)) + mlx5_ib_warn(dev, "failed to destroy dc scq\n"); +error3: + if (ib_destroy_cq(dcd->rcq)) + mlx5_ib_warn(dev, "failed to destroy dc rcq\n"); +error2: + ib_dereg_mr(dcd->mr); +error1: + return err; +} 
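+
+/*
+ * cleanup_driver_cnak() below releases everything init_driver_cnak()
+ * set up for a port: the per-port sysfs node, the SW CNAK QP, both
+ * completion queues, the DMA MR and the rx/tx buffer pages.
+ */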
+ +static void cleanup_driver_cnak(struct mlx5_ib_dev *dev, int port) +{ + struct mlx5_dc_data *dcd = &dev->dcd[port - 1]; + + if (!dcd->initialized) + return; + + cleanup_port_sysfs(dcd); + + if (ib_destroy_qp(dcd->dcqp)) + mlx5_ib_warn(dev, "destroy qp failed\n"); + + if (ib_destroy_cq(dcd->scq)) + mlx5_ib_warn(dev, "destroy scq failed\n"); + + if (ib_destroy_cq(dcd->rcq)) + mlx5_ib_warn(dev, "destroy rcq failed\n"); + + ib_dereg_mr(dcd->mr); + free_dc_tx_buf(dcd); + free_dc_rx_buf(dcd); + dcd->initialized = 0; +} + +static int init_dc_improvements(struct mlx5_ib_dev *dev) +{ + int port; + int err; + + if (!mlx5_core_is_pf(dev->mdev)) + return 0; + + if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace))) + return 0; + + enable_dc_tracer(dev); + + err = init_sysfs(dev); + if (err) + return err; + + if (!MLX5_CAP_GEN(dev->mdev, dc_connect_qp)) + return 0; + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + err = init_driver_cnak(dev, port); + if (err) + goto out; + } + + return 0; + +out: + for (port--; port >= 1; port--) + cleanup_driver_cnak(dev, port); + + cleanup_sysfs(dev); + + return err; +} + +static void cleanup_dc_improvements(struct mlx5_ib_dev *dev) +{ + int port; + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) + cleanup_driver_cnak(dev, port); + cleanup_sysfs(dev); + + disable_dc_tracer(dev); +} + +static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) +{ + mlx5_vport_dealloc_q_counter(dev->mdev, + MLX5_INTERFACE_PROTOCOL_IB, + dev->port[port_num].q_cnt_id); + dev->port[port_num].q_cnt_id = 0; +} + +static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_ports; i++) + mlx5_ib_dealloc_q_port_counter(dev, i); +} + +static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) +{ + int i; + int ret; + + for (i = 0; i < dev->num_ports; i++) { + ret = mlx5_vport_alloc_q_counter(dev->mdev, + MLX5_INTERFACE_PROTOCOL_IB, + &dev->port[i].q_cnt_id); + if (ret) { + mlx5_ib_warn(dev, + "couldn't allocate queue counter for port %d\n", + i + 1); + goto dealloc_counters; + } + } + + return 0; + +dealloc_counters: + while (--i >= 0) + mlx5_ib_dealloc_q_port_counter(dev, i); + + return ret; +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_ib_port *, + struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx5_ib_port *, + struct port_attribute *, + const char *buf, size_t count); +}; + +struct port_counter_attribute { + struct port_attribute attr; + size_t offset; +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx5_ib_port_sysfs_group *p = + container_of(kobj, struct mlx5_ib_port_sysfs_group, + kobj); + struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port, + group); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(mibport, port_attr, buf); +} + +static ssize_t show_port_counter(struct mlx5_ib_port *p, + struct port_attribute *port_attr, + char *buf) +{ + int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); + struct port_counter_attribute *counter_attr = + container_of(port_attr, struct port_counter_attribute, attr); + void *out; + int ret; + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + ret = mlx5_vport_query_q_counter(p->dev->mdev, + p->q_cnt_id, 0, + out, outlen); + if (ret) + goto free; + + ret = sprintf(buf, "%d\n", + 
be32_to_cpu(*(__be32 *)(out + counter_attr->offset))); + +free: + kfree(out); + return ret; +} + +#define PORT_COUNTER_ATTR(_name) \ +struct port_counter_attribute port_counter_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_port_counter, NULL), \ + .offset = MLX5_BYTE_OFF(query_q_counter_out, _name) \ +} + +static PORT_COUNTER_ATTR(rx_write_requests); +static PORT_COUNTER_ATTR(rx_read_requests); +static PORT_COUNTER_ATTR(rx_atomic_requests); +static PORT_COUNTER_ATTR(rx_dct_connect); +static PORT_COUNTER_ATTR(out_of_buffer); +static PORT_COUNTER_ATTR(out_of_sequence); +static PORT_COUNTER_ATTR(duplicate_request); +static PORT_COUNTER_ATTR(rnr_nak_retry_err); +static PORT_COUNTER_ATTR(packet_seq_err); +static PORT_COUNTER_ATTR(implied_nak_seq_err); +static PORT_COUNTER_ATTR(local_ack_timeout_err); + +static struct attribute *counter_attrs[] = { + &port_counter_attr_rx_write_requests.attr.attr, + &port_counter_attr_rx_read_requests.attr.attr, + &port_counter_attr_rx_atomic_requests.attr.attr, + &port_counter_attr_rx_dct_connect.attr.attr, + &port_counter_attr_out_of_buffer.attr.attr, + &port_counter_attr_out_of_sequence.attr.attr, + &port_counter_attr_duplicate_request.attr.attr, + &port_counter_attr_rnr_nak_retry_err.attr.attr, + &port_counter_attr_packet_seq_err.attr.attr, + &port_counter_attr_implied_nak_seq_err.attr.attr, + &port_counter_attr_local_ack_timeout_err.attr.attr, + NULL +}; + +static struct attribute_group port_counters_group = { + .name = "counters", + .attrs = counter_attrs +}; + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show +}; + +static struct kobj_type port_type = { + .sysfs_ops = &port_sysfs_ops, +}; + +static int add_port_attrs(struct mlx5_ib_dev *dev, + struct kobject *parent, + struct mlx5_ib_port_sysfs_group *port, + u8 port_num) +{ + int ret; + + ret = kobject_init_and_add(&port->kobj, &port_type, + parent, + "%d", port_num); + if (ret) + return ret; + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && + MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + ret = sysfs_create_group(&port->kobj, &port_counters_group); + if (ret) + goto put_kobj; + } + + port->enabled = true; + return ret; + +put_kobj: + kobject_put(&port->kobj); + return ret; +} + +static void destroy_ports_attrs(struct mlx5_ib_dev *dev, + unsigned int num_ports) +{ + unsigned int i; + + for (i = 0; i < num_ports; i++) { + struct mlx5_ib_port_sysfs_group *port = + &dev->port[i].group; + + if (!port->enabled) + continue; + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && + MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) + sysfs_remove_group(&port->kobj, + &port_counters_group); + kobject_put(&port->kobj); + port->enabled = false; + } + + if (dev->ports_parent) { + kobject_put(dev->ports_parent); + dev->ports_parent = NULL; + } +} + +static int create_port_attrs(struct mlx5_ib_dev *dev) +{ + int ret = 0; + unsigned int i = 0; + struct device *device = &dev->ib_dev.dev; + + dev->ports_parent = kobject_create_and_add("mlx5_ports", + &device->kobj); + if (!dev->ports_parent) + return -ENOMEM; + + for (i = 0; i < dev->num_ports; i++) { + ret = add_port_attrs(dev, + dev->ports_parent, + &dev->port[i].group, + i + 1); + + if (ret) + goto _destroy_ports_attrs; + } + + return 0; + +_destroy_ports_attrs: + destroy_ports_attrs(dev, i); + return ret; +} + +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_dev *dev; + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if 
(!dev) + return NULL; + + dev->mdev = mdev; + + dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), + GFP_KERNEL); + if (!dev->port) + goto err_dealloc; + + for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { + dev->port[i].dev = dev; + dev->port[i].port_num = i; + } + + err = get_port_caps(dev); + if (err) + goto err_free_port; + + if (mlx5_use_mad_ifc(dev)) + get_ext_port_caps(dev); + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) { + if (MLX5_CAP_GEN(mdev, roce)) { + err = mlx5_nic_vport_enable_roce(mdev); + if (err) + goto err_free_port; + } else { + goto err_free_port; + } + } + + MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); + + strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = mdev->special_contexts.resd_lkey; + dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = + dev->mdev->priv.eq_table.num_comp_vectors; + dev->ib_dev.dma_device = &mdev->pdev->dev; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + dev->ib_dev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + dev->ib_dev.uverbs_exp_cmd_mask = + (1ull << IB_USER_VERBS_EXP_CMD_REG_MR_EX) | + (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.modify_gid = mlx5_ib_modify_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + 
dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.create_flow = mlx5_ib_create_flow; + dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.create_mr = mlx5_ib_create_mr; + dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; + dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; + dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + dev->ib_dev.alloc_indir_reg_list = mlx5_ib_alloc_indir_reg_list; + dev->ib_dev.free_indir_reg_list = mlx5_ib_free_indir_reg_list; + + if (MLX5_CAP_GEN(mdev, xrc)) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + if (MLX5_CAP_GEN(mdev, dct)) { + dev->ib_dev.exp_create_dct = mlx5_ib_create_dct; + dev->ib_dev.exp_destroy_dct = mlx5_ib_destroy_dct; + dev->ib_dev.exp_query_dct = mlx5_ib_query_dct; + dev->ib_dev.exp_arm_dct = mlx5_ib_arm_dct; + dev->ib_dev.uverbs_exp_cmd_mask |= + (1ull << IB_USER_VERBS_EXP_CMD_CREATE_DCT) | + (1ull << IB_USER_VERBS_EXP_CMD_DESTROY_DCT) | + (1ull << IB_USER_VERBS_EXP_CMD_QUERY_DCT) | + (1ull << IB_USER_VERBS_EXP_CMD_ARM_DCT); + } + dev->ib_dev.uverbs_exp_cmd_mask |= (1ull << IB_USER_VERBS_EXP_CMD_CREATE_MR); + + dev->ib_dev.exp_create_qp = mlx5_ib_exp_create_qp; + dev->ib_dev.uverbs_exp_cmd_mask |= (1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP); + + dev->ib_dev.exp_query_device = mlx5_ib_exp_query_device; + dev->ib_dev.uverbs_exp_cmd_mask |= (1 << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE); + dev->ib_dev.exp_query_mkey = mlx5_ib_exp_query_mkey; + dev->ib_dev.uverbs_exp_cmd_mask |= (1 << IB_USER_VERBS_EXP_CMD_QUERY_MKEY); + + if (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) { + dev->ib_dev.create_wq = mlx5_ib_create_wq; + dev->ib_dev.modify_wq = mlx5_ib_modify_wq; + dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; + dev->ib_dev.uverbs_exp_cmd_mask |= + (1ull << IB_USER_VERBS_EXP_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EXP_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EXP_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EXP_CMD_DESTROY_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EXP_CMD_CREATE_FLOW); + dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; + dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; + } + + err = init_node_data(dev); + if (err) + goto err_disable_roce; + + mutex_init(&dev->fs.lock); + mutex_init(&dev->cap_mask_mutex); + 
INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); + + err = create_dev_resources(&dev->devr); + if (err) + goto err_disable_roce; + + + err = mlx5_ib_alloc_q_counters(dev); + if (err) + goto err_odp; + + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_q_cnt; + + err = create_umr_res(dev); + if (err) + goto err_dev; + + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) { + if (init_dc_improvements(dev)) + mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n"); + } + + err = create_port_attrs(dev); + if (err) + goto err_dc; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + if (err) + goto err_port_attrs; + } + + dev->ib_active = true; + + return dev; + +err_port_attrs: + destroy_ports_attrs(dev, dev->num_ports); + +err_dc: + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) + cleanup_dc_improvements(dev); + destroy_umrc_res(dev); + +err_dev: + ib_unregister_device(&dev->ib_dev); + +err_q_cnt: + mlx5_ib_dealloc_q_counters(dev); + +err_odp: + destroy_dev_resources(&dev->devr); + +err_disable_roce: + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) + mlx5_nic_vport_disable_roce(mdev); +err_free_port: + kfree(dev->port); + +err_dealloc: + ib_dealloc_device((struct ib_device *)dev); + + return NULL; +} + +static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) +{ + struct mlx5_ib_dev *dev = context; + + destroy_ports_attrs(dev, dev->num_ports); + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) + cleanup_dc_improvements(dev); + mlx5_ib_dealloc_q_counters(dev); + ib_unregister_device(&dev->ib_dev); + destroy_umrc_res(dev); + destroy_dev_resources(&dev->devr); + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) + mlx5_nic_vport_disable_roce(mdev); + + kfree(dev->port); + ib_dealloc_device(&dev->ib_dev); +} + +static struct mlx5_interface mlx5_ib_interface = { + .add = mlx5_ib_add, + .remove = mlx5_ib_remove, + .event = mlx5_ib_event, + .protocol = MLX5_INTERFACE_PROTOCOL_IB, +}; + +static int __init mlx5_ib_init(void) +{ + int err; + + if (deprecated_prof_sel != 2) + printf("mlx5_ib: WARN: ""prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); + + err = mlx5_register_interface(&mlx5_ib_interface); + if (err) + goto clean_odp; + + mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq"); + if (!mlx5_ib_wq) { + printf("mlx5_ib: ERR: ""%s: failed to create mlx5_ib_wq\n", __func__); + goto err_unreg; + } + + return err; + +err_unreg: + mlx5_unregister_interface(&mlx5_ib_interface); + +clean_odp: + return err; +} + +static void __exit mlx5_ib_cleanup(void) +{ + destroy_workqueue(mlx5_ib_wq); + mlx5_unregister_interface(&mlx5_ib_interface); +} + +module_init_order(mlx5_ib_init, SI_ORDER_THIRD); +module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD); Index: sys/dev/mlx5/mlx5_ib/mlx5_mem.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_mem.c @@ -0,0 +1,184 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include "mlx5_ib.h" + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ + +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order) +{ + uintptr_t tmp; + unsigned long m; + int i, k; + u64 base = 0; + int p = 0; + int skip; + int mask; + u64 len; + u64 pfn; + struct scatterlist *sg; + int entry; + unsigned long page_shift = ilog2(umem->page_size); + + addr = addr >> page_shift; + tmp = (uintptr_t)addr; + m = find_first_bit(&tmp, 8 * sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + for (k = 0; k < len; k++) { + if (!(i & mask)) { + tmp = (uintptr_t)pfn; + m = min_t(unsigned long, m, + find_first_bit(&tmp, 8 * sizeof(tmp))); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } else { + if (base + p != pfn) { + tmp = (uintptr_t)p; + m = find_first_bit(&tmp, 8 * sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } + } + p++; + i++; + } + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = page_shift + m; + *count = i; +} + +/* + * Populate the given array with bus addresses from the umem. + * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. 
+ */ +static void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, + __be64 *pas, int access_flags) +{ + unsigned long umem_page_shift = ilog2(umem->page_size); + int shift = page_shift - umem_page_shift; + int mask = (1 << shift) - 1; + int i, k; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + int entry; + + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> umem_page_shift; + base = sg_dma_address(sg); + for (k = 0; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << umem_page_shift); + cur |= access_flags; + + pas[i >> shift] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, (unsigned long long) + be64_to_cpu(pas[i >> shift])); + } else + mlx5_ib_dbg(dev, "=====> 0x%llx\n", + (unsigned long long) + (base + (k << umem_page_shift))); + i++; + } + } +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + pas, + access_flags); +} + +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = (u64)1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = (u32)(buf_off >> ilog2(off_size)); + return 0; +} Index: sys/dev/mlx5/mlx5_ib/mlx5_mr.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_mr.c @@ -0,0 +1,2048 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + + +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +enum { + MAX_PENDING_REG_MR = 8, + MAX_MR_RELEASE_TIMEOUT = (60 * 20) /* Allow release timeout up to 20 min */ +}; + +#define MLX5_UMR_ALIGN 2048 + +static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev); +static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev); + +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + + return err; +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_core_mr *mmr = &mr->mmr; + struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + unsigned long flags; + int err; + u8 key; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d, order %d\n", status, ent->order); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); + key = dev->mdev->priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); + mmr->key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + mlx5_ib_dbg(dev, "callbacked mkey 0x%x created\n", + be32_to_cpu(mr->out.mkey)); + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + spin_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mmr->key), mmr); + spin_unlock_irqrestore(&table->lock, flags); + if (err) { + mlx5_ib_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n", + mmr->key, err); + mlx5_core_destroy_mkey(mdev, mmr); + } +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int npages = 1 << ent->order; + int err = 0; + int i; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->umred = 1; + mr->dev = dev; + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; + in->seg.log2_page_size = 12; + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + sizeof(*in), reg_mr_callback, + mr, 
&mr->out); + if (err) { + spin_lock_irq(&ent->lock); + ent->pending--; + spin_unlock_irq(&ent->lock); + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static int someone_releasing(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur > 2 * cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + s64 dtime; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + cancel_delayed_work(&ent->dwork); + if (!queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3))) + mlx5_ib_warn(dev, "failed queueing delayed work\n"); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + cancel_delayed_work(&ent->dwork); + if (!queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000))) + mlx5_ib_warn(dev, "failed queueing delayed work\n"); + } else { + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + } + } + } else if (ent->cur > 2 * ent->limit) { + dtime = (cache->last_add + (s64)cache->rel_timeout * HZ) - jiffies; + if (cache->rel_imm || + (cache->rel_timeout >= 0 && !someone_adding(cache) && dtime <= 0)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + } else if (cache->rel_timeout >= 0) { + dtime = max_t(s64, dtime, 0); + dtime = min_t(s64, dtime, (MAX_MR_RELEASE_TIMEOUT * HZ)); + cancel_delayed_work(&ent->dwork); + if (!queue_delayed_work(cache->wq, &ent->dwork, dtime)) + mlx5_ib_warn(dev, "failed queueing delayed work\n"); + } + } else if (cache->rel_imm && !someone_releasing(cache)) { + cache->rel_imm = 0; + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int c; + int i; + + c = 
order2idx(dev, order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + break; + } + spin_unlock_irq(&ent->lock); + + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + + if (mr) + break; + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + +static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + c = order2idx(dev, mr->order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); + return; + } + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey 0x%x from order %d\n", + mr->mmr.key, ent->order); + else + kfree(mr); + } +} + +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + +enum { + MLX5_VF_MR_LIMIT = 2, +}; + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int limit; + int err; + int i; + + mutex_init(&dev->slow_path_mutex); + cache->rel_timeout = 300; + cache->wq = create_singlethread_workqueue("mkey_cache"); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + setup_timer(&dev->delay_timer, delay_time_func, (uintptr_t)dev); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + INIT_LIST_HEAD(&cache->ent[i].head); + spin_lock_init(&cache->ent[i].lock); + + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + + if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) { + if (mlx5_core_is_pf(dev->mdev)) + limit = dev->mdev->profile->mr_cache[i].limit; + else + limit = MLX5_VF_MR_LIMIT; + } else { + limit = 0; + } + + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + ent->limit = limit; + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + } + + err = mlx5_mr_sysfs_init(dev); + if (err) + mlx5_ib_warn(dev, "failed to init mr cache sysfs\n"); + + return 0; +} + +static void 
wait_for_async_commands(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int total = 0; + int i; + int j; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + for (j = 0 ; j < 1000; j++) { + if (!ent->pending) + break; + msleep(50); + } + } + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + total += ent->pending; + } + + if (total) + mlx5_ib_dbg(dev, "aborted, %d pending requests\n", total); + else + mlx5_ib_dbg(dev, "done with all pending requests\n"); +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + mlx5_mr_sysfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + wait_for_async_commands(dev); + del_timer_sync(&dev->delay_timer); + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + seg = &in->seg; + seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); + if (err) + goto err_in; + + kfree(in); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_size) +{ + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> ilog2(page_size); + return (npages + 1) / 2; +} + +static int use_umr(int order) +{ + return order <= MLX5_MAX_UMR_SHIFT; +} + +static int use_klm(int order) +{ + return order <= 31; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_mr *mr = dev->umrc.mr; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + sg->addr = dma; + sg->length = ALIGN(sizeof(u64) * n, 64); + sg->lkey = mr->lkey; + + wr->next = NULL; + wr->send_flags = 0; + wr->sg_list = sg; + if (n) + wr->num_sge = 1; + else + wr->num_sge = 0; + + wr->opcode = MLX5_IB_WR_UMR; + + umrwr->npages = n; + umrwr->page_shift = page_shift; + umrwr->mkey = key; + umrwr->target.virt_addr = virt_addr; + umrwr->length = len; + umrwr->access_flags = access_flags; + umrwr->pd = pd; +} + +static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, + struct ib_send_wr *wr, u32 key) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr->opcode = MLX5_IB_WR_UMR; + umrwr->mkey = key; +} + +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_ib_umr_context *context; + struct ib_wc wc; + int err; + + while (1) { + err = ib_poll_cq(cq, 1, &wc); + if (err < 0) { + printf("mlx5_ib: WARN: ""poll cq error %d\n", 
err); + return; + } + if (err == 0) + break; + + context = (struct mlx5_ib_umr_context *)(uintptr_t)wc.wr_id; + context->status = wc.status; + complete(&context->done); + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + struct mlx5_ib_mr *mr; + struct ib_sge sg; + int size; + __be64 *mr_pas; + __be64 *pas; + dma_addr_t dma; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. */ + size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!mr_pas) { + err = -ENOMEM; + goto free_mr; + } + + pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); + + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + err = -ENOMEM; + goto free_pas; + } + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(uintptr_t)&umr_context; + prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift, + virt_addr, len, access_flags); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + goto unmap_dma; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } + } + + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + +unmap_dma: + up(&umrc->sem); + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + kfree(mr_pas); + +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); + } + + return mr; +} + +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, + u64 length, struct ib_umem *umem, + int npages, int page_shift, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int inlen; + int err; + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_1; + } + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + + /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + * in the page list submitted with the command. */ + in->flags = pg_cap ? 
cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; + in->seg.flags = convert_access(access_flags) | + MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.start_addr = cpu_to_be64(virt_addr); + in->seg.len = cpu_to_be64(length); + in->seg.bsfs_octo_size = 0; + in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); + in->seg.log2_page_size = page_shift; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->umem = umem; + mr->dev = dev; + kvfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + + return mr; + +err_2: + kvfree(in); + +err_1: + kfree(mr); + + return ERR_PTR(err); +} + +enum { + MLX5_MAX_REG_ORDER = MAX_MR_CACHE_ENTRIES + 1, + MLX5_MAX_REG_SIZE = 2ul * 1024 * 1024 * 1024, +}; + +static u64 get_lsize(int page_shift) +{ + u64 l1; + u64 l2; + + l1 = (u64)1 << (page_shift + MLX5_MAX_REG_ORDER); + l2 = MLX5_MAX_REG_SIZE; + + if (l1 > l2) + return l2; + + return l1; +} + +static int alloc_mrs(struct mlx5_ib_dev *dev, struct mlx5_ib_mr **lmr, int n, + int order, u64 size, int nchild, int sorder, u64 len, + u64 off, int npages) +{ + int err = 0; + int i; + int k; + + for (i = 0, k = 0; i < n; i++) { +again: + if (k++ > 3) { + err = -EAGAIN; + goto out; + } + lmr[i] = alloc_cached_mr(dev, order); + if (!lmr[i]) { + err = add_keys(dev, order2idx(dev, order), n - i); + if (err) { + if (err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed to add %d keys (%d)\n", n - i, err); + goto out; + } + msleep(20); + } + goto again; + } + lmr[i]->size = size; + lmr[i]->page_count = 1 << order; + k = 0; + } + + if (nchild == n) + return 0; + + for (k = 0; k < 3; k++) { + lmr[i] = alloc_cached_mr(dev, sorder); + if (lmr[i]) + break; + err = add_keys(dev, order2idx(dev, sorder), 1); + if (err) { + if (err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + goto out; + } + msleep(20); + } + } + + if (!lmr[i]) { + err = -EAGAIN; + goto out; + } + lmr[i]->size = len - size * n + off; + lmr[i]->page_count = npages - (n << order); + + return 0; + +out: + for (--i; i >= 0; --i) + free_cached_mr(dev, lmr[i]); + + return err; +} + +static int create_indirect_key(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_mr *mr, unsigned n) +{ + struct mlx5_create_mkey_mbox_in *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + mr->dev = dev; + in->seg.status = 1 << 6; /* free */; + in->seg.flags = MLX5_ACCESS_MODE_KLM | MLX5_PERM_UMR_EN; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.xlt_oct_size = cpu_to_be32(ALIGN(n, 4)); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + NULL, NULL, NULL); + + kfree(in); + return err; +} + +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + int err; + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + 
if (err) { + up(&umrc->sem); + mlx5_ib_warn(dev, "err %d\n", err); + goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); + } + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "unreg umr failed\n"); + err = -EFAULT; + goto error; + } + return 0; + +error: + return err; +} + +static int reg_mrs(struct ib_pd *pd, struct mlx5_ib_mr **mrs, int n, + dma_addr_t dma, int copy, int page_shift, void *dptr, + __be64 *pas, int access_flags, u64 maxorder) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr *bad; + struct ib_send_wr wr; + struct ib_sge sg; + int err1; + int err; + int i; + + for (i = 0; i < n; ++i) { + if (copy) { + memcpy(dptr, pas + (i << maxorder), + sizeof(__be64) * mrs[i]->page_count); + mrs[i]->dma = dma; + } else { + mrs[i]->dma = dma + (sizeof(__be64) << maxorder) * i; + } + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_reg_wqe(pd, + &wr, + &sg, + mrs[i]->dma, + mrs[i]->page_count, + mrs[i]->mmr.key, + page_shift, + 0, + mrs[i]->size, + access_flags); + down(&umrc->sem); + mlx5_ib_init_umr_context(&umr_context); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + up(&umrc->sem); + goto out; + } + wait_for_completion(&umr_context.done); + up(&umrc->sem); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + goto out; + } + } + return 0; +out: + for (--i; i >= 0; --i) { + err1 = unreg_umr(dev, mrs[i]); + if (err1) + mlx5_ib_warn(dev, "unreg_umr failed %d\n", err1); + } + + return err; +} + +static void populate_klm(void *dma, struct mlx5_ib_mr **lmr, int n, u64 off) +{ + struct mlx5_wqe_data_seg *dseg = dma; + int i; + + for (i = 0; i < n; i++) { + dseg[i].lkey = cpu_to_be32(lmr[i]->mmr.key); + if (!i) { + dseg[i].byte_count = cpu_to_be32((u32)(lmr[i]->size - off)); + dseg[0].addr = cpu_to_be64(off); + } else { + dseg[i].byte_count = cpu_to_be32((u32)(lmr[i]->size)); + dseg[i].addr = 0; + } + } +} + +static void prep_indirect_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_mr *mr = dev->umrc.mr; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + sg->addr = dma; + sg->length = ALIGN(sizeof(u64) * n, 64); + sg->lkey = mr->lkey; + + wr->next = NULL; + wr->send_flags = 0; + wr->sg_list = sg; + wr->num_sge = 1; + wr->opcode = MLX5_IB_WR_UMR; + wr->send_flags = 0; + /* since post send interprets this as MTTs and since a KLM + is two MTTs, we multiply by two to have */ + umrwr->npages = n * 2; + umrwr->page_shift = page_shift; + umrwr->mkey = key; + umrwr->target.virt_addr = virt_addr; + umrwr->length = len; + umrwr->access_flags = access_flags; + umrwr->pd = pd; +} + +static void free_mrs(struct mlx5_ib_dev *dev, struct mlx5_ib_mr **lmr, int n) +{ + int i; + + for (i = 0; i < n; i++) + if (lmr[i]) + free_cached_mr(dev, lmr[i]); +} + +static int get_nchild(int npages, int page_shift, u64 *maxorder, int *sorder, int *quot) +{ + int res; + int denom; + + denom = min_t(int, 1 << MLX5_MAX_REG_ORDER, MLX5_MAX_REG_SIZE >> page_shift); + res = npages % denom; + *quot = npages / denom; + *maxorder = ilog2(denom); + *sorder = max_t(int, ilog2(roundup_pow_of_two(res)), 2); + return *quot + (res ? 
1 : 0); +} + +static struct mlx5_ib_mr *reg_klm(struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + unsigned size = sizeof(__be64) * npages; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_mr **lmr = NULL; + struct mlx5_ib_mr *imr = NULL; + struct ib_send_wr *bad; + struct ib_send_wr wr; + struct mlx5_ib_umr_context umr_context; + __be64 *spas = NULL; + __be64 *pas = NULL; + dma_addr_t dma = 0; + unsigned dsize; + int err = -ENOMEM; + struct ib_sge sg; + int nchild; + int sorder; + void *dptr; + u64 lsize; + int i = 0; + int err1; + int quot; + u64 off; + u64 maxorder; + + mlx5_ib_dbg(dev, "addr 0x%llx, len 0x%llx, npages %d, page_shift %d, order %d, access_flags 0x%x\n", + (long long)virt_addr, (long long)len, npages, page_shift, order, access_flags); + lsize = get_lsize(page_shift); + nchild = get_nchild(npages, page_shift, &maxorder, &sorder, "); + off = (virt_addr & ((1 << page_shift) - 1)); + lmr = kcalloc(nchild, sizeof(*lmr), GFP_KERNEL); + if (!lmr) { + mlx5_ib_warn(dev, "allocation failed\n"); + err = -ENOMEM; + goto out; + } + + pas = mlx5_vmalloc(size); + if (!pas) { + mlx5_ib_warn(dev, "allocation failed\n"); + err = -ENOMEM; + goto out; + } + + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); +#define is_vmalloc_addr(x) 0 + if (is_vmalloc_addr(pas)) { + dsize = sizeof(__be64) << maxorder; + spas = kmalloc(dsize, GFP_KERNEL); + if (!spas) { + err = -ENOMEM; + mlx5_ib_warn(dev, "allocation failed\n"); + goto out; + } + dptr = spas; + } else { + dsize = size; + dptr = pas; + } + + dma = dma_map_single(ddev, dptr, dsize, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + err = -ENOMEM; + mlx5_ib_warn(dev, "dma map failed\n"); + goto out; + } + + err = alloc_mrs(dev, lmr, quot, maxorder, lsize, nchild, sorder, len, off, npages); + if (err) { + mlx5_ib_dbg(dev, "alloc_mrs failed\n"); + goto out_map; + } + + imr = kzalloc(sizeof(*imr), GFP_KERNEL); + if (!imr) { + err = -ENOMEM; + mlx5_ib_warn(dev, "failed allocation\n"); + goto out_mrs; + } + + err = create_indirect_key(dev, pd, imr, nchild); + if (err) { + mlx5_ib_warn(dev, "failed creating indirect key %d\n", err); + goto out_mrs; + } + imr->size = len; + + err = reg_mrs(pd, lmr, nchild, dma, !!spas, + page_shift, dptr, pas, access_flags, maxorder); + if (err) { + mlx5_ib_warn(dev, "reg_mrs failed %d\n", err); + goto out_indir; + } + + populate_klm(dptr, lmr, nchild, off); + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + imr->dma = dma; + prep_indirect_wqe(pd, &wr, &sg, dma, nchild, imr->mmr.key, page_shift, + virt_addr, len, access_flags); + down(&umrc->sem); + mlx5_ib_init_umr_context(&umr_context); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + up(&umrc->sem); + goto out_unreg; + } + wait_for_completion(&umr_context.done); + up(&umrc->sem); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + goto out_unreg; + } + imr->children = lmr; + imr->nchild = nchild; + + dma_unmap_single(ddev, dma, dsize, DMA_TO_DEVICE); + kfree(spas); + kvfree(pas); + + return imr; + +out_unreg: + for (i = 0; i < nchild; ++i) { + err1 = unreg_umr(dev, lmr[i]); + if (err1) + mlx5_ib_warn(dev, "unreg_umr failed %d\n", err1); + } +out_indir: + err1 = mlx5_core_destroy_mkey(dev->mdev, &imr->mmr); + if 
(err1) + mlx5_ib_warn(dev, "destroy imr mkey failed %d\n", err1); +out_mrs: + kfree(imr); + free_mrs(dev, lmr, nchild); +out_map: + dma_unmap_single(ddev, dma, dsize, DMA_TO_DEVICE); +out: + kfree(spas); + kvfree(pas); + kfree(lmr); + return ERR_PTR(err); +} + +static int clean_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + int umred = mr->umred; + int err; + int i; + + if (!umred) { + for (i = 0; i < mr->nchild; ++i) { + if (unreg_umr(dev, mr->children[i])) + mlx5_ib_warn(dev, "child %d\n", i); + + free_cached_mr(dev, mr->children[i]); + } + kfree(mr->children); + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + } else { + err = unreg_umr(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed unregister\n"); + return err; + } + } + + return 0; +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata, int mr_id) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + struct ib_peer_memory_client *ib_peer_mem; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + (unsigned long long)start, (unsigned long long)virt_addr, + (unsigned long long)length, access_flags); + umem = ib_umem_get_ex(pd->uobject->context, start, length, access_flags, + 0, 1); + if (IS_ERR(umem)) { + mlx5_ib_warn(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } + ib_peer_mem = umem->ib_peer_mem; + + mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); + if (!npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + err = -EINVAL; + goto error; + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + npages, ncont, order, page_shift); + + if (use_umr(order)) { + mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, + order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d", order); + mr = NULL; + } + } else if (use_klm(order) && !(access_flags & IB_ACCESS_ON_DEMAND)) { + mr = reg_klm(pd, umem, virt_addr, length, ncont, page_shift, + order, access_flags); + if (IS_ERR(mr)) { + mlx5_ib_dbg(dev, "reg_klm failed for order %d (%ld)", + order, PTR_ERR(mr)); + mr = NULL; + } + } else if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + mlx5_ib_warn(dev, "Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); + goto error; + } + + if (!mr) { + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, + access_flags); + mutex_unlock(&dev->slow_path_mutex); + } + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + mr = NULL; + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + + mr->umem = umem; + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + + return &mr->ibmr; + +error: + /* + * Destroy the umem *before* destroying the MR, to ensure we + * will not have any in-flight notifiers when destroying the + * MR. + * + * As the MR is completely invalid to begin with, and this + * error path is only taken if we can't push the mr entry into + * the pagefault tree, this is safe. 
+ */ + + ib_umem_release(umem); + return ERR_PTR(err); +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct ib_umem *umem = mr->umem; + int npages = mr->npages; + int umred = mr->umred; + int err; + + err = clean_mr(mr); + if (err) + return err; + + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } + + if (umred) + free_cached_mr(dev, mr); + else + kfree(mr); + + return 0; +} + +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int access_mode, err; + int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4); + + if (!(mr_init_attr->flags & + (IB_MR_SIGNATURE_EN | IB_MR_INDIRECT_REG))) + return ERR_PTR(-EINVAL); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + access_mode = MLX5_ACCESS_MODE_MTT; + + if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) { + u32 psv_index[2]; + + in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | + MLX5_MKEY_BSF_EN); + in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + access_mode = MLX5_ACCESS_MODE_KLM; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } + + in->seg.flags = MLX5_PERM_UMR_EN | access_mode; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto err_destroy_psv; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int err; + + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + } + + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + + kfree(mr); + + return err; +} + +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd 
*pd, + int max_page_list_len) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + /* + * TBD not needed - issue 197292 */ + in->seg.log2_page_size = PAGE_SHIFT; + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); + kfree(in); + if (err) { + mlx5_ib_warn(dev, "failed create mkey\n"); + goto err_free; + } + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof(u64); + + mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + struct mlx5_ib_dev *dev = to_mdev(page_list->device); + int size = page_list->max_page_list_len * sizeof(u64); + + dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + printf("mlx5_ib: ERR: ""Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + printf("mlx5_ib: ERR: ""signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} + +struct order_attribute { + struct attribute attr; + ssize_t (*show)(struct cache_order *, struct order_attribute *, char *buf); + ssize_t (*store)(struct cache_order *, struct order_attribute *, + const char *buf, size_t count); +}; + +static ssize_t cur_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = 
&dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->cur); + return err; +} + +static ssize_t limit_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->limit); + return err; +} + +static ssize_t limit_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + int err; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, co->index, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t miss_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->miss); + return err; +} + +static ssize_t miss_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var != 0) + return -EINVAL; + + ent->miss = var; + + return count; +} + +static ssize_t size_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->size); + return err; +} + +static ssize_t size_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + int err; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, co->index, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, co->index, ent->size - var); + } + + return count; +} + +static ssize_t order_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct order_attribute *oa = + container_of(attr, struct order_attribute, attr); + struct cache_order *co = container_of(kobj, struct cache_order, kobj); + + if (!oa->show) + return -EIO; + + return oa->show(co, oa, buf); +} + +static ssize_t order_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t size) +{ + struct order_attribute *oa = + container_of(attr, struct order_attribute, attr); + struct cache_order *co = container_of(kobj, struct cache_order, kobj); + + if (!oa->store) + return -EIO; + + return oa->store(co, oa, buf, size); +} + +static const struct sysfs_ops order_sysfs_ops = { + .show = order_attr_show, + .store = order_attr_store, +}; + +#define ORDER_ATTR(_name) struct order_attribute order_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, 
_name##_store) +#define ORDER_ATTR_RO(_name) struct order_attribute order_attr_##_name = \ + __ATTR(_name, 0444, _name##_show, NULL) + +static ORDER_ATTR_RO(cur); +static ORDER_ATTR(limit); +static ORDER_ATTR(miss); +static ORDER_ATTR(size); + +static struct attribute *order_default_attrs[] = { + &order_attr_cur.attr, + &order_attr_limit.attr, + &order_attr_miss.attr, + &order_attr_size.attr, + NULL +}; + +static struct kobj_type order_type = { + .sysfs_ops = &order_sysfs_ops, + .default_attrs = order_default_attrs +}; + + + +struct cache_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_ib_dev *dev, char *buf); + ssize_t (*store)(struct mlx5_ib_dev *dev, const char *buf, size_t count); +}; + +static ssize_t rel_imm_show(struct mlx5_ib_dev *dev, char *buf) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int err; + + err = snprintf(buf, 20, "%d\n", cache->rel_imm); + return err; +} + +static ssize_t rel_imm_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) +{ + struct mlx5_mr_cache *cache = &dev->cache; + u32 var; + int i; + int found = 0; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var > 1) + return -EINVAL; + + if (var == cache->rel_imm) + return count; + + cache->rel_imm = var; + if (cache->rel_imm == 1) { + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur > 2 * cache->ent[i].limit) { + queue_work(cache->wq, &cache->ent[i].work); + found = 1; + } + } + if (!found) + cache->rel_imm = 0; + } + + return count; +} +static ssize_t rel_timeout_show(struct mlx5_ib_dev *dev, char *buf) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int err; + + err = snprintf(buf, 20, "%d\n", cache->rel_timeout); + return err; +} + +static ssize_t rel_timeout_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int var; + int i; + + if (kstrtoint(buf, 0, &var)) + return -EINVAL; + + if (var < -1 || var > MAX_MR_RELEASE_TIMEOUT) + return -EINVAL; + + if (var == cache->rel_timeout) + return count; + + if (cache->rel_timeout == -1 || (var < cache->rel_timeout && var != -1)) { + cache->rel_timeout = var; + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur > 2 * cache->ent[i].limit) + queue_work(cache->wq, &cache->ent[i].work); + } + } else { + cache->rel_timeout = var; + } + + return count; +} + +static ssize_t cache_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct cache_attribute *ca = + container_of(attr, struct cache_attribute, attr); + struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); + + if (!ca->show) + return -EIO; + + return ca->show(dev, buf); +} + +static ssize_t cache_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t size) +{ + struct cache_attribute *ca = + container_of(attr, struct cache_attribute, attr); + struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); + + if (!ca->store) + return -EIO; + + return ca->store(dev, buf, size); +} + +static const struct sysfs_ops cache_sysfs_ops = { + .show = cache_attr_show, + .store = cache_attr_store, +}; + +#define CACHE_ATTR(_name) struct cache_attribute cache_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static CACHE_ATTR(rel_imm); +static CACHE_ATTR(rel_timeout); + +static struct attribute *cache_default_attrs[] = { + &cache_attr_rel_imm.attr, + &cache_attr_rel_timeout.attr, + NULL +}; + +static struct kobj_type cache_type = { + .sysfs_ops = &cache_sysfs_ops, + .default_attrs = 
cache_default_attrs +}; + +static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct device *device = &dev->ib_dev.dev; + struct cache_order *co; + int o; + int i; + int err; + + err = kobject_init_and_add(&dev->mr_cache, &cache_type, + &device->kobj, "mr_cache"); + if (err) + return -ENOMEM; + + for (o = 2, i = 0; i < MAX_MR_CACHE_ENTRIES; o++, i++) { + co = &cache->ent[i].co; + co->order = o; + co->index = i; + co->dev = dev; + err = kobject_init_and_add(&co->kobj, &order_type, + &dev->mr_cache, "%d", o); + if (err) + goto err_put; + } + + return 0; + +err_put: + for (; i >= 0; i--) { + co = &cache->ent[i].co; + kobject_put(&co->kobj); + } + kobject_put(&dev->mr_cache); + + return err; +} + +static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct cache_order *co; + int i; + + for (i = MAX_MR_CACHE_ENTRIES - 1; i >= 0; i--) { + co = &cache->ent[i].co; + kobject_put(&co->kobj); + } + kobject_put(&dev->mr_cache); +} + +int mlx5_ib_exp_query_mkey(struct ib_mr *mr, u64 mkey_attr_mask, + struct ib_mkey_attr *mkey_attr) +{ + struct mlx5_ib_mr *mmr = to_mmr(mr); + + mkey_attr->max_reg_descriptors = mmr->max_reg_descriptors; + + return 0; +} + +struct ib_indir_reg_list * +mlx5_ib_alloc_indir_reg_list(struct ib_device *device, + unsigned int max_indir_list_len) +{ + struct device *ddev = device->dma_device; + struct mlx5_ib_indir_reg_list *mirl = NULL; + int dsize; + int err; + + mirl = kzalloc(sizeof(*mirl), GFP_KERNEL); + if (!mirl) + return ERR_PTR(-ENOMEM); + + mirl->ib_irl.sg_list = kcalloc(max_indir_list_len, + sizeof(*mirl->ib_irl.sg_list), + GFP_KERNEL); + if (!mirl->ib_irl.sg_list) { + err = -ENOMEM; + goto err_sg_list; + } + + dsize = sizeof(*mirl->klms) * max_indir_list_len; + dsize += max_t(int, MLX5_UMR_ALIGN - 1, 0); + mirl->mapped_ilist = kzalloc(dsize, GFP_KERNEL); + if (!mirl->mapped_ilist) { + err = -ENOMEM; + goto err_mapped_list; + } + + mirl->klms = (void *)ALIGN((unsigned long long)mirl->mapped_ilist, + MLX5_UMR_ALIGN); + mirl->map = dma_map_single(ddev, mirl->klms, + dsize, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, mirl->map)) { + err = -ENOMEM; + goto err_dma_map; + } + + return &mirl->ib_irl; +err_dma_map: + kfree(mirl->mapped_ilist); +err_mapped_list: + kfree(mirl->ib_irl.sg_list); +err_sg_list: + kfree(mirl); + + return ERR_PTR(err); +} + +void +mlx5_ib_free_indir_reg_list(struct ib_indir_reg_list *indir_list) +{ + struct mlx5_ib_indir_reg_list *mirl = to_mindir_list(indir_list); + struct device *ddev = indir_list->device->dma_device; + int dsize; + + dsize = sizeof(*mirl->klms) * indir_list->max_indir_list_len; + dma_unmap_single(ddev, mirl->map, dsize, DMA_TO_DEVICE); + kfree(mirl->mapped_ilist); + kfree(mirl->ib_irl.sg_list); + kfree(mirl); +} Index: sys/dev/mlx5/mlx5_ib/mlx5_qp.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_qp.c @@ -0,0 +1,5245 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "user.h" +#include +#include + +#define IPV6_DEFAULT_HOPLIMIT 64 + + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state); + +/* not supported currently */ +static int workqueue_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_CACHE_LINE_SIZE = 64, +}; + +enum { + MLX5_RQ_NUM_STATE = MLX5_RQC_STATE_ERR + 1, + MLX5_SQ_NUM_STATE = MLX5_SQC_STATE_ERR + 1, + MLX5_QP_STATE = MLX5_QP_NUM_STATE + 1, + MLX5_QP_STATE_BAD = MLX5_QP_STATE + 1, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_REG_INDIR_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + +struct umr_wr { + u64 virt_addr; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u32 length; + int access_flags; + u32 mkey; +}; + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_GSI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + + +static int +query_wqe_idx(struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int ret; + + outb = 
kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) + return -ENOMEM; + + context = &outb->ctx; + + mutex_lock(&qp->mutex); + ret = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + if (ret) + goto out_free; + + ret = be16_to_cpu(context->hw_sq_wqe_counter) & (qp->sq.wqe_cnt - 1); + +out_free: + mutex_unlock(&qp->mutex); + kfree(outb); + + return ret; +} + +static int mlx5_handle_sig_pipelining(struct mlx5_ib_qp *qp) +{ + int wqe_idx; + + wqe_idx = query_wqe_idx(qp); + if (wqe_idx < 0) { + printf("mlx5_ib: ERR: ""Failed to query QP 0x%x wqe index\n", qp->mqp.qpn); + return wqe_idx; + } + + if (qp->sq.swr_ctx[wqe_idx].sig_piped) { + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_wqe_ctrl_seg *cwqe; + + cwqe = mlx5_get_send_wqe(qp, wqe_idx); + cwqe->opmod_idx_opcode = cpu_to_be32(be32_to_cpu(cwqe->opmod_idx_opcode) & 0xffffff00); + qp->sq.swr_ctx[wqe_idx].w_list.opcode |= MLX5_OPCODE_SIGNATURE_CANCELED; + mlx5_ib_dbg(dev, "Cancel QP 0x%x wqe_index 0x%x\n", + qp->mqp.qpn, wqe_idx); + } + + return 0; +} + +static void mlx5_ib_sqd_work(struct work_struct *work) +{ + struct mlx5_ib_sqd *sqd; + struct mlx5_ib_qp *qp; + struct ib_qp_attr qp_attr; + + sqd = container_of(work, struct mlx5_ib_sqd, work); + qp = sqd->qp; + + if (mlx5_handle_sig_pipelining(qp)) + goto out; + + mutex_lock(&qp->mutex); + if (__mlx5_ib_modify_qp(&qp->ibqp, &qp_attr, 0, IB_QPS_SQD, IB_QPS_RTS)) + printf("mlx5_ib: ERR: ""Failed to resume QP 0x%x\n", qp->mqp.qpn); + mutex_unlock(&qp->mutex); +out: + kfree(sqd); +} + +static void mlx5_ib_sigerr_sqd_event(struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_sqd *sqd; + + sqd = kzalloc(sizeof(*sqd), GFP_ATOMIC); + if (!sqd) + return; + + sqd->qp = qp; + INIT_WORK(&sqd->work, mlx5_ib_sqd_work); + queue_work(mlx5_ib_wq, &sqd->work); +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_SQ_DRAINED && + to_mibqp(qp)->state != IB_QPS_SQD) { + mlx5_ib_sigerr_sqd_event(to_mibqp(qp)); + return; + } + + if (type == MLX5_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + int wqe_size; + int wq_size; + + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + cap->max_recv_wr = 0; + 
cap->max_recv_sge = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + MLX5_CAP_GEN(dev->mdev, + max_wqe_sz_rq)); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(enum ib_qp_type qp_type) +{ + int size = 0; + + switch (qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case MLX5_IB_QPT_SW_CNAK: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_mlx_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_exp_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr->qp_type); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int get_send_sge(struct ib_exp_qp_init_attr *attr, int wqe_size) +{ + int max_sge; + + if (attr->qp_type == IB_QPT_RC) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else if (attr->qp_type == IB_EXP_QPT_DC_INI) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_datagram_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else if (attr->qp_type == IB_QPT_XRC_INI) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_xrc_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else + max_sge = (wqe_size - sq_overhead(attr->qp_type)) / + sizeof(struct mlx5_wqe_data_seg); + + return min_t(int, max_sge, 
wqe_size - sq_overhead(attr->qp_type) / + sizeof(struct mlx5_wqe_data_seg)); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_exp_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + int wqe_size; + int wq_size; + + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_warn(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * (u64)wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_warn(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = get_send_sge(attr, wqe_size); + if (qp->sq.max_gs < attr->cap.max_send_sge) { + mlx5_ib_warn(dev, "max sge(%d) exceeds limits(%d)\n", + qp->sq.max_gs, attr->cap.max_send_sge); + return -ENOMEM; + } + + attr->cap.max_send_sge = qp->sq.max_gs; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd, + struct ib_exp_qp_init_attr *attr) +{ + int desc_sz = 1 << qp->sq.wqe_shift; + + if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -EINVAL; + } + + + if (attr->qp_type == IB_QPT_RAW_PACKET) { + qp->buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->sq_buf_size = qp->sq.wqe_cnt << 6; + } else { + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + qp->sq_buf_size = 0; + } + + return 0; +} + +static int qp_has_rq(struct ib_exp_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +static int first_med_uuar(void) +{ + return 1; +} + +static int next_uuar(int n) +{ + n++; + + while (((n % 4) & 2)) + n++; + + return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ + int n; + + n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - + uuari->num_low_latency_uuars - 1; + + return n >= 0 ? 
n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ + return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ + int med; + int i; + int t; + + med = num_med_uuar(uuari); + for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + t++; + if (t == med) + return next_uuar(i); + } + + return 0; +} + +static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +{ + int i; + + for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { + if (!test_bit(i, uuari->bitmap)) { + set_bit(i, uuari->bitmap); + uuari->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +{ + int minidx = first_med_uuar(); + int i; + + for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { + if (uuari->count[i] < uuari->count[minidx]) + minidx = i; + } + + uuari->count[minidx]++; + + return minidx; +} + +static int alloc_uuar(struct mlx5_uuar_info *uuari, + enum mlx5_ib_latency_class lat) +{ + int uuarn = -EINVAL; + + mutex_lock(&uuari->lock); + switch (lat) { + case MLX5_IB_LATENCY_CLASS_LOW: + uuarn = 0; + uuari->count[uuarn]++; + break; + + case MLX5_IB_LATENCY_CLASS_MEDIUM: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_HIGH: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_FAST_PATH: + uuarn = 2; + break; + } + mutex_unlock(&uuari->lock); + + return uuarn; +} + +static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; + int high_uuar = nuuars - uuari->num_low_latency_uuars; + + mutex_lock(&uuari->lock); + if (uuarn == 0) { + --uuari->count[uuarn]; + goto out; + } + + if (uuarn < high_uuar) { + free_med_class_uuar(uuari, uuarn); + goto out; + } + + free_high_class_uuar(uuari, uuarn); + +out: + mutex_unlock(&uuari->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case MLX5_IB_QPT_SW_CNAK: return MLX5_QP_ST_SW_CNAK; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_PACKET: + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_EXP_QPT_DC_INI: return MLX5_QP_ST_DCI; + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq 
*send_cq, + struct mlx5_ib_cq *recv_cq); + +static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +{ + return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; +} + +static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &rwq->db); + if (rwq->umem) + ib_umem_release(rwq->umem); +} + +static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq, + struct mlx5_ib_create_wq *ucmd) +{ + struct mlx5_ib_ucontext *context; + int page_shift = 0; + int npages; + u32 offset = 0; + int ncont = 0; + int err; + + if (!ucmd->buf_addr || rwq->buf_size <= 0) + return -EINVAL; + + context = to_mucontext(pd->uobject->context); + rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, + rwq->buf_size, 0, 0); + if (IS_ERR(rwq->umem)) { + mlx5_ib_warn(dev, "umem_get failed\n"); + err = PTR_ERR(rwq->umem); + return err; + } + + mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, + &rwq->rq_page_offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + rwq->rq_num_pas = ncont; + rwq->page_shift = page_shift; + rwq->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE); + + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, rwq->buf_size, + npages, page_shift, ncont, offset); + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db); + if (err) { + mlx5_ib_warn(dev, "map failed\n"); + goto err_umem; + } + + rwq->create_type = MLX5_WQ_USER; + return 0; + +err_umem: + ib_umem_release(rwq->umem); + return err; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_exp_qp_init_attr *attr, + struct mlx5_create_qp_mbox_in **in, + int *inlen, + struct mlx5_exp_ib_create_qp *ucmd) +{ + struct mlx5_exp_ib_create_qp_resp resp; + struct mlx5_ib_ucontext *context; + int page_shift = 0; + int uar_index; + int npages; + u32 offset = 0; + int uuarn; + int ncont = 0; + int err; + + context = to_mucontext(pd->uobject->context); + memset(&resp, 0, sizeof(resp)); + resp.size_of_prefix = offsetof(struct mlx5_exp_ib_create_qp_resp, prefix_reserved); + /* + * TBD: should come from the verbs when we have the API + */ + if (ucmd->exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX) { + /* In CROSS_CHANNEL CQ and QP must use the same UAR */ + if (attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + return -EINVAL; + + if (ucmd->exp.wc_uar_index == MLX5_EXP_CREATE_QP_DB_ONLY_UUAR) { + /* Assign LATENCY_CLASS_LOW (DB only UUAR) to this QP */ + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "DB only uuar allocation failed\n"); + return uuarn; + } + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + } else if (ucmd->exp.wc_uar_index >= MLX5_IB_MAX_CTX_DYNAMIC_UARS || + context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index] == + MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_warn(dev, "dynamic uuar allocation failed\n"); + return -EINVAL; + } else { + uar_index = context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index]; + uuarn = MLX5_EXP_INVALID_UUAR; + } + } else if (attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) { + uuarn = MLX5_CROSS_CHANNEL_UUAR; + uar_index = 
uuarn_to_uar_index(&context->uuari, uuarn); + } else { + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } + } + } + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + } + mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, (struct mlx5_ib_create_qp *)ucmd, attr); + if (err) + goto err_uuar; + + if (ucmd->buf_addr && qp->buf_size) { + qp->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + mlx5_ib_warn(dev, "umem_get failed\n"); + err = PTR_ERR(qp->umem); + goto err_uuar; + } + } else { + qp->umem = NULL; + } + + if (qp->umem) { + mlx5_ib_cont_pages(qp->umem, ucmd->buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, qp->buf_size, + npages, page_shift, ncont, offset); + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + if (qp->umem) + mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + (*in)->ctx.params2 = cpu_to_be32(offset << 6); + + (*in)->ctx.params2 |= (qp->flags & MLX5_IB_QP_CAP_CROSS_CHANNEL ? + cpu_to_be32(MLX5_QP_BIT_COLL_MASTER) : 0); + (*in)->ctx.params2 |= (qp->flags & MLX5_IB_QP_CAP_MANAGED_SEND ? + cpu_to_be32(MLX5_QP_BIT_COLL_SYNC_SQ) : 0); + (*in)->ctx.params2 |= (qp->flags & MLX5_IB_QP_CAP_MANAGED_RECV ? 
+ cpu_to_be32(MLX5_QP_BIT_COLL_SYNC_RQ) : 0); + + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + resp.uuar_index = uuarn; + qp->uuarn = uuarn; + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &qp->db); + if (err) { + mlx5_ib_warn(dev, "map failed\n"); + goto err_free; + } + + if (udata->src == IB_UDATA_EXP_CMD && + attr->qp_type == IB_QPT_RAW_PACKET && qp->sq.wqe_cnt && + (ucmd->exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_FLAGS_IDX) && + (ucmd->exp.flags & MLX5_EXP_CREATE_QP_MULTI_PACKET_WQE_REQ_FLAG)) { + /* + * Enable Multi-Packet WQE only if: + * - user process is privilege + * - SRIOV is not supported + * - Multi-Packet is supported + */ + if (priv_check(curthread, PRIV_DRIVER) == 0 && + MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe) == 1) { + qp->allow_mp_wqe = 1; + resp.exp.comp_mask |= MLX5_EXP_CREATE_QP_RESP_MASK_FLAGS_IDX; + resp.exp.flags |= MLX5_EXP_CREATE_QP_RESP_MULTI_PACKET_WQE_FLAG; + } + } + if (udata->src == IB_UDATA_EXP_CMD) + err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); + else + err = ib_copy_to_udata(udata, &resp, sizeof(struct mlx5_ib_create_qp_resp)); + + if (err) { + mlx5_ib_err(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + kvfree(*in); + +err_umem: + if (qp->umem) + ib_umem_release(qp->umem); + +err_uuar: + free_uuar(&context->uuari, uuarn); + return err; +} + +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (qp->umem) + ib_umem_release(qp->umem); + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + /* + * Free only the UUARs handled by the kernel. + * UUARs of UARs allocated dynamically are handled by user. 
+ */ + if (qp->uuarn != MLX5_EXP_INVALID_UUAR) + free_uuar(&context->uuari, qp->uuarn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_exp_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in **in, int *inlen) +{ + enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; + struct mlx5_uuar_info *uuari; + int uar_index; + int uuarn; + int err; + + uuari = &dev->mdev->priv.uuari; + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + IB_QP_CREATE_SIGNATURE_PIPELINE | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; + + uuarn = alloc_uuar(uuari, lc); + if (uuarn < 0) { + mlx5_ib_warn(dev, "\n"); + return -ENOMEM; + } + + qp->bf = &uuari->bfs[uuarn]; + uar_index = qp->bf->uar->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_warn(dev, "err %d\n", err); + goto err_uuar; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto err_uuar; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + /* Set "fast registration enabled" for all kernel QPs */ + (*in)->ctx.params1 |= cpu_to_be32(1 << 11); + (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + + mlx5_fill_page_array(&qp->buf, (*in)->pas); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto err_free; + } + + qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx), + GFP_KERNEL); + qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->rq.rwr_ctx), + GFP_KERNEL); + if (!qp->sq.swr_ctx || !qp->rq.rwr_ctx) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.swr_ctx); + kfree(qp->rq.rwr_ctx); + +err_free: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &qp->buf); + +err_uuar: + free_uuar(&dev->mdev->priv.uuari, uuarn); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.swr_ctx); + kfree(qp->rq.rwr_ctx); + mlx5_buf_free(dev->mdev, &qp->buf); + free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); +} + +static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_exp_qp_init_attr *attr) +{ + enum ib_qp_type qt = attr->qp_type; + + if (attr->srq || (qt == IB_QPT_XRC_TGT) || (qt == IB_QPT_XRC_INI) || + (qt == IB_EXP_QPT_DC_INI)) + return cpu_to_be32(MLX5_SRQ_RQ); + else if (!qp->has_rq) + return cpu_to_be32(MLX5_ZERO_LEN_RQ); + else + return cpu_to_be32(MLX5_NON_ZERO_RQ); +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case 
MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case MLX5_IB_QPT_SW_CNAK: + case IB_QPT_RAW_PACKET: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL; + break; + + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_tir(dev->mdev, qp->tirn); +} + +static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_pd *pd, + struct ib_exp_qp_init_attr *init_attr) +{ + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int inlen; + int err; + u32 *in; + void *tirc; + void *hfso; + u32 selected_fields = 0; + u32 tdn = mucontext->tdn; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, + init_attr->rx_hash_conf->rwq_ind_tbl->ind_tbl_num); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + switch (init_attr->rx_hash_conf->rx_hash_function) { + case IB_EX_RX_HASH_FUNC_XOR: + err = -ENOSYS; + goto err; + + case IB_EX_RX_HASH_FUNC_TOEPLITZ: + { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); + + if (len != init_attr->rx_hash_conf->rx_key_len) { + err = -EINVAL; + goto err; + } + + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ); + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + memcpy(rss_key, init_attr->rx_hash_conf->rx_hash_key, len); + break; + } + default: + err = -EINVAL; + goto err; + } + + if (!init_attr->rx_hash_conf->rx_hash_fields_mask) { + /* special case when this TIR serves as steering entry without hashing */ + if (!init_attr->rx_hash_conf->rwq_ind_tbl->log_ind_tbl_size) + goto create_tir; + err = -EINVAL; + goto err; + } + + if (((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_IPV4) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_IPV4)) && + ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_IPV6) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_IPV6))) { + err = -EINVAL; + goto err; + } + + /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ + if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_IPV4) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_IPV4)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + else if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_IPV6) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_IPV6)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + + if (((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_PORT_TCP) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_PORT_TCP)) && + ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_PORT_UDP) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & 
IB_RX_HASH_DST_PORT_UDP))) { + err = -EINVAL; + goto err; + } + + /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ + if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_PORT_TCP) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_PORT_TCP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + else if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_PORT_UDP) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_PORT_UDP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + + if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_IPV4) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; + + if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_IPV4) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; + + if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_PORT_TCP) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_SRC_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; + + if ((init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_PORT_TCP) || + (init_attr->rx_hash_conf->rx_hash_fields_mask & IB_RX_HASH_DST_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; + + MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); + +create_tir: + err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->tirn); + + if (err) + goto err; + + kvfree(in); + /* qpn is reserved for that QP */ + qp->mqp.qpn = 0; + return 0; + +err: + kvfree(in); + return err; +} + +static int create_raw_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + memset(in, 0, sizeof(in)); + + MLX5_SET(tisc, tisc, transport_domain, tdn); + + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &qp->tisn); +} + +static void destroy_raw_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_tis(dev->mdev, qp->tisn); +} + +static void mlx5_ib_sq_event(struct mlx5_core_qp *sq, int type) +{ + struct ib_qp *ibqp = &sq_to_mibqp(sq)->ibqp; + struct ib_event event; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + default: + printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on SQ %06x\n", type, sq->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static void mlx5_ib_rq_event(struct mlx5_core_qp *rq, int type) +{ + struct ib_qp *ibqp = &rq_to_mibqp(rq)->ibqp; + struct ib_event event; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + default: + printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on RQ %06x\n", type, rq->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int create_raw_qp_sq(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + void *qpin, struct ib_pd *pd) +{ + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; 
+ int err; + int page_shift = 0; + int npages; + int ncont = 0; + u32 offset = 0; + + qp->sq_umem = ib_umem_get(pd->uobject->context, qp->sq_buf_addr, + qp->sq_buf_size, 0, 0); + if (IS_ERR(qp->sq_umem)) { + mlx5_ib_warn(dev, "umem_get failed\n"); + qp->sq_umem = NULL; + return -ENOMEM; + } + + mlx5_ib_cont_pages(qp->sq_umem, qp->sq_buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(qp->sq_buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "SQ bad offset\n"); + err = -EINVAL; + goto err_umem; + } + + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + (long long)qp->sq_buf_addr, qp->sq_buf_size, npages, page_shift, ncont, offset); + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; + in = mlx5_vzalloc(inlen); + if (!in) { + mlx5_ib_err(dev, "allocation failed\n"); + err = -ENOMEM; + goto err_umem; + } + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + MLX5_SET(sqc, sqc, allow_multi_pkt_send_wqe, qp->allow_mp_wqe); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, qp->tisn); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, page_offset, offset); + + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, qp->sq_umem, page_shift, pas, 0); + + err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &qp->msq); + kvfree(in); + if (err) + goto err_umem; + + return 0; + +err_umem: + ib_umem_release(qp->sq_umem); + qp->sq_umem = NULL; + + return err; +} + +static void destroy_raw_qp_sq(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_sq_tracked(dev->mdev, &qp->msq); +} + +static int get_rq_pas_size(void *qpc) +{ + u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12; + u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride); + u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size); + u32 page_offset = MLX5_GET(qpc, qpc, page_offset); + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static int create_raw_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + void *qpin) +{ + __be64 *pas; + __be64 *qp_pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + int inlen; + int err; + u32 rq_pas_size = get_rq_pas_size(qpc); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, vlan_strip_disable, 1); + MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); 
+ MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, end_padding_mode, MLX5_GET(qpc, qpc, end_padding_mode)); + MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset)); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); + memcpy(pas, qp_pas, rq_pas_size); + + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &qp->mrq); + + kvfree(in); + + return err; +} + +static void destroy_raw_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_rq_tracked(dev->mdev, &qp->mrq); +} + +static int create_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u32 tdn) +{ + u32 *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, qp->mrq.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->tirn); + + kvfree(in); + + return err; +} + +static void destroy_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_tir(dev->mdev, qp->tirn); +} + +static int create_raw_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in *in, struct ib_pd *pd) +{ + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int err; + u32 tdn = mucontext->tdn; + + + if (!qp->sq.wqe_cnt && !qp->rq.wqe_cnt) + return -EINVAL; + + if (qp->sq.wqe_cnt) { + err = create_raw_qp_tis(dev, qp, tdn); + if (err) + return err; + + err = create_raw_qp_sq(dev, qp, in, pd); + if (err) + goto err_tis; + + qp->sq_state = MLX5_SQC_STATE_RST; + qp->msq.event = mlx5_ib_sq_event; + } + + if (qp->rq.wqe_cnt) { + err = create_raw_qp_rq(dev, qp, in); + if (err) + goto err_sq; + + + err = create_raw_qp_tir(dev, qp, tdn); + if (err) + goto err_rq; + + qp->rq_state = MLX5_RQC_STATE_RST; + qp->mrq.event = mlx5_ib_rq_event; + } + + /* If we allocated SQ, then the QP number should be the SQ number, + * otherwise the RQ number. + */ + qp->mqp.qpn = qp->sq.wqe_cnt ? 
qp->msq.qpn : qp->mrq.qpn; + + INIT_LIST_HEAD(&qp->mc_flows_list.flows_list); + mutex_init(&qp->mc_flows_list.lock); + + return 0; + +err_rq: + destroy_raw_qp_rq(dev, qp); +err_sq: + if (qp->sq.wqe_cnt) + destroy_raw_qp_sq(dev, qp); +err_tis: + if (qp->sq.wqe_cnt) + destroy_raw_qp_tis(dev, qp); + + return err; +} + +enum { + MLX5_QP_END_PAD_MODE_ALIGN = MLX5_WQ_END_PAD_MODE_ALIGN, + MLX5_QP_END_PAD_MODE_NONE = MLX5_WQ_END_PAD_MODE_NONE, +}; + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_exp_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_create_qp_mbox_in *in = NULL; + struct mlx5_exp_ib_create_qp ucmd; + struct mlx5_ib_create_qp *pucmd = NULL; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + int inlen = sizeof(*in); + size_t ucmd_size; + int err; + int st; + u32 uidx; + void *qpc; + + if (udata && (init_attr->qp_type == MLX5_IB_QPT_REG_UMR || + init_attr->qp_type == MLX5_IB_QPT_SW_CNAK)) { + mlx5_ib_warn(dev, "required QP type is supported only for kernel consumers\n"); + return -ENOSYS; + } + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + if (init_attr->rx_hash_conf) { + if (!udata) { + mlx5_ib_err(dev, "internal error\n"); + return -ENOSYS; + } + + err = create_rss_raw_qp_tir(dev, qp, pd, init_attr); + return err; + } + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { + mlx5_ib_warn(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if ((init_attr->create_flags & IB_QP_CREATE_ATOMIC_BE_REPLY) && + (dev->atomic_cap != IB_ATOMIC_HCA_REPLY_BE) && + mlx5_host_is_le()) { + mlx5_ib_dbg(dev, "Create QP with atomic BE REPLY is not supported on this type HCA device\n"); + return -EINVAL; + } + if ((init_attr->create_flags & IB_QP_CREATE_RX_END_PADDING) && + MLX5_CAP_GEN(mdev, pad_cap)) + qp->flags |= MLX5_IB_QP_CAP_RX_END_PADDING; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (init_attr->max_inl_recv) + qp->scat_cqe = 1; + + if (pd && pd->uobject) { + memset(&ucmd, 0, sizeof(ucmd)); + if (udata->src == IB_UDATA_EXP_CMD) { + ucmd_size = sizeof(ucmd); + } else { + ucmd_size = sizeof(struct mlx5_ib_create_qp); + if (ucmd_size > offsetof(struct mlx5_exp_ib_create_qp, size_of_prefix)) { + mlx5_ib_warn(dev, "mlx5_ib_create_qp is too big to fit as prefix of mlx5_exp_ib_create_qp\n"); + return -EINVAL; + } + } + err = ib_copy_from_udata(&ucmd, udata, min(udata->inlen, ucmd_size)); + if (err) { + mlx5_ib_err(dev, "copy failed\n"); + return err; + } + if ((udata->src == IB_UDATA_EXP_CMD) && + ((ucmd.size_of_prefix > sizeof(struct mlx5_ib_create_qp)) || + (ucmd.exp.comp_mask >= MLX5_EXP_CREATE_QP_MASK_RESERVED))) { + mlx5_ib_warn(dev, "Unrecognized driver data\n"); + return -EINVAL; + } + pucmd = (struct mlx5_ib_create_qp *)&ucmd; + if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_UIDX) + uidx = ucmd.exp.uidx; + else + uidx = 0xffffff; + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + } else { + qp->wq_sig = !!workqueue_signature; + uidx = 0xffffff; + } + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? 
pucmd : NULL); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + __u32 max_wqes = + 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_warn(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > max_wqes) { + mlx5_ib_warn(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, max_wqes); + return -EINVAL; + } + if ((init_attr->create_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) && + !MLX5_CAP_GEN(dev->mdev, cd)) { + mlx5_ib_warn(dev, "%s does not support cross-channel operations\n", + dev->ib_dev.name); + return -EINVAL; + } + + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX5_IB_QP_CAP_CROSS_CHANNEL; + + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX5_IB_QP_CAP_MANAGED_SEND; + + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX5_IB_QP_CAP_MANAGED_RECV; + + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, + &inlen, &ucmd); + if (err) + mlx5_ib_warn(dev, "err %d\n", err); + } else { + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + mlx5_ib_warn(dev, "Raw Eth QP is disabled for Kernel consumers\n"); + return -EINVAL; + } + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + if (err) + mlx5_ib_warn(dev, "err %d\n", err); + else + qp->pa_lkey = to_mpd(pd)->pa_lkey; + } + + if (err) + return err; + } else { + in = mlx5_vzalloc(sizeof(*in)); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + st = to_mlx5_st(init_attr->qp_type); + if (st < 0) { + mlx5_ib_warn(dev, "invalid service type\n"); + err = st; + goto err_create; + } + in->ctx.flags |= cpu_to_be32(st << 16 | MLX5_QP_PM_MIGRATED << 11); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? 
pd : devr->p0)->pdn); + else + in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE); + + if (qp->wq_sig) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + + if (qp->flags & MLX5_IB_QP_CAP_RX_END_PADDING) + in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_ALIGN << 2); + else + in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_NONE << 2); + + if (init_attr->create_flags & IB_QP_CREATE_SIGNATURE_PIPELINE) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_DRAIN_SIGERR); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) { + in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; + init_attr->max_inl_recv = 64; + } else { + in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; + init_attr->max_inl_recv = 32; + } + + if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { + in->ctx.cs_req = 0; + } else { + if (scqe_sz == 128) + in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + else + in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + } + } else { + init_attr->max_inl_recv = 0; + } + + if (qp->rq.wqe_cnt) { + in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); + in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + } + + in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + + if (qp->sq.wqe_cnt) + in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + else + in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + } else { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s1)->msrq.srqn); + } + } + + if (init_attr->send_cq) + in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + + in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + + if (MLX5_CAP_GEN(mdev, cqe_version)) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(qpc, qpc, user_index, uidx); + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) { + mlx5_ib_warn(dev, "Raw Ethernet QP is allowed only for Ethernet link layer\n"); + return -ENOSYS; + } + if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD) { + qp->sq_buf_addr = ucmd.exp.sq_buf_addr; + } else { + mlx5_ib_warn(dev, "Raw Ethernet QP needs SQ buff address\n"); + return -EINVAL; + } + err = create_raw_qp(dev, qp, in, pd); + } else { + err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + qp->mqp.event = mlx5_ib_qp_event; + } + + if (err) { + mlx5_ib_warn(dev, "create qp failed\n"); + goto 
err_create; + } + + kvfree(in); + /* Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(pd, qp); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + + kvfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } + } else if (recv_cq) { + spin_lock(&recv_cq->lock); + __acquire(&send_cq->lock); + } else { + __acquire(&send_cq->lock); + __acquire(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } + } else { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } + } else if (recv_cq) { + __release(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } else { + __release(&recv_cq->lock); + __release(&send_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void destroy_raw_qp_rules(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_fs_mc_flow *flow_iter; + struct mlx5_ib_fs_mc_flow *temp_iter; + + mutex_lock(&qp->mc_flows_list.lock); + list_for_each_entry_safe(flow_iter, temp_iter, + &qp->mc_flows_list.flows_list, + list) { + if (mlx5_ib_destroy_flow(flow_iter->ib_flow)) + mlx5_ib_warn(dev, "Fail to destroy ib_flow, qp num=0x%x\n", + qp->mqp.qpn); + list_del(&flow_iter->list); + kfree(flow_iter); + } + mutex_unlock(&qp->mc_flows_list.lock); +} + +static void destroy_raw_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + destroy_raw_qp_rules(dev, qp); + + if (qp->rq.wqe_cnt) { + destroy_raw_qp_tir(dev, qp); + destroy_raw_qp_rq(dev, qp); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_qp_sq(dev, qp); + destroy_raw_qp_tis(dev, qp); + } +} + +static int modify_raw_qp_to_state(struct 
mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u16 operation); + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_modify_qp_mbox_in *in; + unsigned long flags; + int err; + + if (qp->ibqp.rwq_ind_tbl) { + destroy_rss_raw_qp_tir(dev, qp); + return; + } + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return; + + if (qp->state != IB_QPS_RESET) { + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { + if (mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, in, 0, + &qp->mqp)) + mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", + qp->mqp.qpn); + } else { + if (modify_raw_qp_to_state(dev, qp, MLX5_CMD_OP_2RST_QP)) + mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", + qp->mqp.qpn); + } + } + + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + if (send_cq) + list_del(&qp->cq_send_list); + + if (recv_cq) + list_del(&qp->cq_recv_list); + + if (qp->create_type == MLX5_QP_KERNEL) { + __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + } + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + destroy_raw_qp(dev, qp); + } else { + err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + qp->mqp.qpn); + } + + kfree(in); + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(&get_pd(qp)->ibpd, qp); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; + case MLX5_IB_QPT_SW_CNAK: + return "MLX5_QP_ST_SW_CNAK"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +static struct ib_qp *__create_qp(struct ib_pd *pd, + struct ib_exp_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + u32 rcqn; + u32 scqn; + + if (pd) { + dev = to_mdev(pd->device); + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + printf("mlx5_ib: WARN: ""%s: no PD for transport %s\n", __func__, ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!MLX5_CAP_GEN(dev->mdev, xrc)) { + mlx5_ib_warn(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = 
to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_EXP_QPT_DC_INI: + case MLX5_IB_QPT_REG_UMR: + case MLX5_IB_QPT_SW_CNAK: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_warn(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->mqp.qpn; + + rcqn = init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1; + scqn = init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1; + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->mqp.qpn, rcqn, scqn); + + qp->xrcdn = xrcdn; + + break; + + case IB_QPT_RAW_IPV6: + case IB_QPT_MAX: + default: + mlx5_ib_warn(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_exp_qp_init_attr *attrx; + struct ib_qp *qp; + + attrx = kzalloc(sizeof(*attrx), GFP_KERNEL); + if (!attrx) + return ERR_PTR(-ENOMEM); + + memcpy(attrx, init_attr, sizeof(*init_attr)); + + qp = __create_qp(pd, attrx, udata); + if (!IS_ERR(qp)) + memcpy(init_attr, attrx, sizeof(*init_attr)); + + kfree(attrx); + return qp; +} + +struct ib_qp *mlx5_ib_exp_create_qp(struct ib_pd *pd, + struct ib_exp_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + return __create_qp(pd, init_attr, udata); +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static u32 atomic_mode_qp(struct mlx5_ib_dev *dev) +{ + unsigned long mask; + unsigned long tmp; + + mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp) & + MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + tmp = find_last_bit(&mask, BITS_PER_LONG); + if (tmp < 2 || tmp >= BITS_PER_LONG) + return MLX5_ATOMIC_MODE_NONE; + + if (tmp == 2) + return MLX5_ATOMIC_MODE_CX; + + return tmp << MLX5_ATOMIC_MODE_OFF; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | + atomic_mode_qp(dev)); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + if (rate == 
IB_RATE_PORT_CURRENT) { + return 0; + } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { + return -EINVAL; + } else { + while (rate != IB_RATE_2_5_GBPS && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + MLX5_CAP_GEN(dev->mdev, stat_rate_support))) + --rate; + } + + return rate + MLX5_STAT_RATE_OFFSET; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr, + int alt) +{ + enum rdma_link_layer ll = dev->ib_dev.get_link_layer(&dev->ib_dev, + port); + int err; + int gid_type; + + if ((ll == IB_LINK_LAYER_ETHERNET) || (ah->ah_flags & IB_AH_GRH)) { + int len = dev->ib_dev.gid_tbl_len[port - 1]; + if (ah->grh.sgid_index >= len) { + printf("mlx5_ib: ERR: ""sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, len - 1); + return -EINVAL; + } + } + + if (ll == IB_LINK_LAYER_ETHERNET) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -EINVAL; + + err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index, + &gid_type); + if (err) + return err; + memcpy(path->rmac, ah->dmac, sizeof(ah->dmac)); + path->udp_sport = mlx5_get_roce_udp_sport(dev, port, + ah->grh.sgid_index, + 0); + path->dci_cfi_prio_sl = (ah->sl & 0xf) << 4; + } else { + path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->grh_mlid = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + if (ah->ah_flags & IB_AH_GRH) + path->grh_mlid |= 1 << 7; + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = cpu_to_be16(alt ? + attr->alt_pkey_index : + attr->pkey_index); + + path->dci_cfi_prio_sl = ah->sl & 0xf; + } + + path->fl_free_ar |= (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; + + if (ah->ah_flags & IB_AH_GRH) { + path->mgid_index = ah->grh.sgid_index; + if ((ll == IB_LINK_LAYER_ETHERNET) && + (gid_type != IB_GID_TYPE_IB) && + (ah->grh.hop_limit < 2)) + path->hop_limit = IPV6_DEFAULT_HOPLIMIT; + else + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, ah->static_rate); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = alt ? 
attr->alt_timeout << 3 : attr->timeout << 3; + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_RAE, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_DC_KEY, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RAE, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RAE, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_RAE, + + }, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return 
MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int modify_raw_qp_rq(struct mlx5_core_dev *dev, struct mlx5_ib_qp *qp, + int new_state) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rqn, qp->mrq.qpn); + MLX5_SET(modify_rq_in, in, rq_state, qp->rq_state); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + err = mlx5_core_modify_rq(dev, in, inlen); + if (err) + goto out; + + qp->rq_state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_qp_sq(struct mlx5_core_dev *dev, struct mlx5_ib_qp *qp, + int new_state) +{ + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sqn, qp->msq.qpn); + MLX5_SET(modify_sq_in, in, sq_state, qp->sq_state); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + err = mlx5_core_modify_sq(dev, in, inlen); + if (err) + goto out; + + qp->sq_state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_qp_to_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u16 operation) +{ + struct mlx5_core_dev *mdev = dev->mdev; + int sq_state; + int rq_state; + int err; + + switch (operation) { + case MLX5_CMD_OP_2RST_QP: + sq_state = MLX5_SQC_STATE_RST; + rq_state = MLX5_RQC_STATE_RST; + break; + case MLX5_CMD_OP_2ERR_QP: + sq_state = MLX5_SQC_STATE_ERR; + rq_state = MLX5_RQC_STATE_ERR; + break; + default: + return -EINVAL; + } + + if (qp->rq.wqe_cnt) { + err = modify_raw_qp_rq(mdev, qp, rq_state); + if (err) { + mlx5_ib_warn(dev, "Failed to modify RQ %06x to %s\n", + qp->mrq.qpn, + rq_state == MLX5_RQC_STATE_RST ? "RESET" : "ERROR"); + return err; + } + } + if (qp->sq.wqe_cnt) { + err = modify_raw_qp_sq(mdev, qp, sq_state); + if (err) { + mlx5_ib_warn(dev, "Failed to modify SQ %06x to %s\n", + qp->msq.qpn, + sq_state == MLX5_SQC_STATE_RST ? 
"RESET" : "ERROR"); + return err; + } + } + + return 0; +} + +static int modify_raw_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u16 operation) +{ + int err; + + switch (operation) { + case MLX5_CMD_OP_RST2INIT_QP: + if (qp->rq.wqe_cnt) { + err = modify_raw_qp_rq(dev->mdev, qp, MLX5_RQC_STATE_RDY); + if (err) + return err; + } + if (qp->sq.wqe_cnt) + return modify_raw_qp_sq(dev->mdev, qp, MLX5_SQC_STATE_RDY); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + /* TBD: update eth_prio using MODIFY_TIS*/ + break; + case MLX5_CMD_OP_2ERR_QP: + case MLX5_CMD_OP_2RST_QP: + return modify_raw_qp_to_state(dev, qp, operation); + + default: + /* If we get here, then we don't need to do anything for this + * transition. The operation is always valid because the caller + * checks it + */ + return 0; + } + + return 0; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQD_RTS_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_modify_qp_mbox_in *in; + struct mlx5_ib_pd *pd; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int sqd_event; + int mlx5_st; + int err; + u16 op; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + context = &in->ctx; + err = to_mlx5_st(ibqp->qp_type); + if (err < 0) + goto out; + + context->flags = cpu_to_be32(err << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if (ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < 
IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_DC_KEY) + context->dc_access_key = cpu_to_be64(attr->dct_key); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr, 0); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + attr->alt_port_num, + attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + 0, attr, 1); + if (err) + goto out; + } + + pd = get_pd(qp); + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn & 0xffffff); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if ((attr_mask & IB_QP_ACCESS_FLAGS) && + (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !dev->enable_atomic_resp) { + mlx5_ib_warn(dev, "atomic responder is not supported\n"); + err = -EINVAL; + goto out; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn & 0xffffff); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + u8 port_num = (attr_mask & IB_QP_PORT ? 
attr->port_num : + qp->port) - 1; + struct mlx5_ib_port *mibport = &dev->port[port_num]; + + context->qp_counter_set_usr_page |= + cpu_to_be32(mibport->q_cnt_id << 24); + } + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + mlx5_st = to_mlx5_st(ibqp->qp_type); + if (mlx5_st < 0) { + err = -EINVAL; + goto out; + } + + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) { + err = -EINVAL; + goto out; + } + + op = optab[mlx5_cur][mlx5_new]; + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + in->optparam = cpu_to_be32(optpar); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) + err = modify_raw_qp(dev, qp, op); + else + err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, + &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + if (qp->db.db) { + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + } + +out: + kfree(in); + return err; +} + +static int ignored_ts_check(enum ib_qp_type qp_type) +{ + if (qp_type == MLX5_IB_QPT_REG_UMR || + qp_type == MLX5_IB_QPT_SW_CNAK) + return 1; + + return 0; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + int port; + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); + } + if (!ignored_ts_check(ibqp->qp_type) && + !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, ll)) + goto out; + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; + if (attr->pkey_index >= + dev->mdev->port_caps[port - 1].pkey_table_len) + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) + goto out; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) + goto out; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +static void set_mlx_seg(struct mlx5_mlx_seg *seg, struct mlx5_mlx_wr *wr) +{ + memset(seg, 0, sizeof(*seg)); + seg->stat_rate_sl = wr->sl & 0xf; + seg->dlid = cpu_to_be16(wr->dlid); + seg->flags = wr->icrc ? 8 : 0; +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __be16 get_klm_octo(int npages) +{ + return cpu_to_be16(ALIGN(npages, 8) / 2); +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int li) +{ + memset(umr, 0, sizeof(*umr)); + + if (li) { + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = 1 << 7; + return; + } + + umr->flags = (1 << 5); /* fail if not free */ + umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len); + umr->mkey_mask = frwr_mkey_mask(); +} + +static __be64 get_umr_reg_mr_mask(int atomic) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + 
MLX5_MKEY_MASK_FREE; + + if (atomic) + result |= MLX5_MKEY_MASK_A; + + + return cpu_to_be64(result); +} + +static __be64 get_umr_unreg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_mtt_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int atomic) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + memset(umr, 0, sizeof(*umr)); + + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ + else + umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ + + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { + umr->klm_octowords = get_klm_octo(umrwr->npages); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { + umr->mkey_mask = get_umr_update_mtt_mask(); + umr->bsf_octowords = get_klm_octo(umrwr->target.offset); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } else { + umr->mkey_mask = get_umr_reg_mr_mask(atomic); + } + } else { + umr->mkey_mask = get_umr_unreg_mr_mask(); + } + + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, + int li, int *writ) +{ + memset(seg, 0, sizeof(*seg)); + if (li) { + seg->status = MLX5_MKEY_STATUS_FREE; + return; + } + + seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | + MLX5_ACCESS_MODE_MTT; + *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); + seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2); + seg->log2_page_size = wr->wr.fast_reg.page_shift; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { + seg->status = MLX5_MKEY_STATUS_FREE; + return; + } + + seg->flags = convert_access(umrwr->access_flags); + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); + } + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(umrwr->mkey)); +} + +static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, + struct ib_send_wr *wr, + struct mlx5_core_dev *mdev, + struct mlx5_ib_pd *pd, + int writ) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + u64 *page_list = wr->wr.fast_reg.page_list->page_list; + u64 perm = MLX5_EN_RD | (writ ? 
MLX5_EN_WR : 0); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) + mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm); + dseg->addr = cpu_to_be64(mfrpl->map); + dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64)); + dseg->lkey = cpu_to_be32(pd->pa_lkey); +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 calc_wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(uintptr_t)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = (int)(qend - wqe); + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? 
+ MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs; + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->sg_list->length; + u32 data_key = wr->sg_list->lkey; + u64 data_va = wr->sg_list->addr; + int ret; + int wqe_size; + + if (!wr->wr.sig_handover.prot || + (data_key == wr->wr.sig_handover.prot->lkey && + data_va == wr->wr.sig_handover.prot->addr && + data_len == wr->wr.sig_handover.prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. 
+ * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->wr.sig_handover.prot->lkey; + u64 prot_va = wr->wr.sig_handover.prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + printf("mlx5_ib: ERR: ""Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_send_wr *wr, u32 nelements, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, u32 nelements) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->klm_octowords = get_klm_octo(nelements); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, 
int *size) +{ + struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 klm_oct_size; + int region_len, ret; + + if (unlikely(wr->num_sge != 1) || + unlikely(wr->wr.sig_handover.access_flags & + IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->sg_list->length; + if (wr->wr.sig_handover.prot && + (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey || + wr->wr.sig_handover.prot->addr != wr->sg_list->addr || + wr->wr.sig_handover.prot->length != wr->sg_list->length)) + region_len += wr->wr.sig_handover.prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1; + + set_sig_umr_segment(*seg, wr, klm_oct_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + printf("mlx5_ib: ERR: ""Bad signature type given.\n"); + return 1; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, + struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) +{ + int writ = 0; + int li; + + li = wr->opcode == IB_WR_LOCAL_INV ? 
1 : 0; + if (unlikely(wr->send_flags & IB_SEND_INLINE)) + return -EINVAL; + + set_frwr_umr_segment(*seg, wr, li); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_mkey_segment(*seg, wr, li, &writ); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + + set_frwr_pages(*seg, wr, mdev, pd, writ); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void set_indir_mkey_segment(struct mlx5_mkey_seg *seg, + const struct ib_send_wr *wr, u32 pdn) +{ + u32 list_len = wr->wr.indir_reg.indir_list_len; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->wr.indir_reg.access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(wr->wr.indir_reg.mkey)); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | pdn); + seg->len = cpu_to_be64(wr->wr.indir_reg.length); + seg->start_addr = cpu_to_be64(wr->wr.indir_reg.iova_start); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(list_len * 2))); +} + +static void set_indir_data_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + u32 pa_key, void **seg, int *size) +{ + struct mlx5_wqe_data_seg *data = *seg; + struct mlx5_ib_indir_reg_list *mirl; + struct ib_sge *sg_list = wr->wr.indir_reg.indir_list->sg_list; + u32 list_len = wr->wr.indir_reg.indir_list_len; + int i; + + mirl = to_mindir_list(wr->wr.indir_reg.indir_list); + for (i = 0; i < list_len; i++) { + mirl->klms[i].va = cpu_to_be64(sg_list[i].addr); + mirl->klms[i].key = cpu_to_be32(sg_list[i].lkey); + mirl->klms[i].bcount = cpu_to_be32(sg_list[i].length); + } + + data->byte_count = cpu_to_be32(ALIGN(sizeof(struct mlx5_klm) * + list_len, 64)); + data->lkey = cpu_to_be32(pa_key); + data->addr = cpu_to_be64(mirl->map); + *seg += sizeof(*data); + *size += sizeof(*data) / 16; +} + +static void set_indir_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + const struct ib_send_wr *wr) +{ + u64 mask; + u32 list_len = wr->wr.indir_reg.indir_list_len; + + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_UMR_CHECK_NOT_FREE; + umr->klm_octowords = get_klm_octo(list_len * 2); + mask = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_FREE; + + umr->mkey_mask = cpu_to_be64(mask); +} + +static int set_indir_reg_wr(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct mlx5_ib_pd *pd = get_pd(qp); + + if (unlikely(wr->send_flags & IB_SEND_INLINE)) + return -EINVAL; + + set_indir_umr_segment(*seg, wr); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely(*seg == qp->sq.qend)) + *seg = mlx5_get_send_wqe(qp, 0); + + set_indir_mkey_segment(*seg, wr, pd->pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely(*seg == qp->sq.qend)) + *seg = mlx5_get_send_wqe(qp, 0); + + set_indir_data_seg(wr, qp, pd->pa_lkey, seg, size); + + return 0; +} + +static void 
dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; + + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, + unsigned bytecnt, struct mlx5_ib_qp *qp) +{ + while (bytecnt > 0) { + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + bytecnt -= 64; + if (unlikely(src == qp->sq.qend)) + src = mlx5_get_send_wqe(qp, 0); + } +} + +static u8 get_fence(u8 fence, struct ib_send_wr *wr) +{ + if (unlikely(wr->opcode == IB_WR_LOCAL_INV && + wr->send_flags & IB_SEND_FENCE)) + return MLX5_FENCE_MODE_STRONG_ORDERING; + + if (unlikely(fence)) { + if (wr->send_flags & IB_SEND_FENCE) + return MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + return fence; + + } else { + return 0; + } +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + int err = 0; + + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), "work queue overflow\n"); + err = -ENOMEM; + return err; + } + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(u32 *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? 
+ MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, + struct ib_send_wr *wr, + int nreq, u8 fence, u8 next_fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + qp->fm_cache = next_fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = calc_wq_sig(ctrl); + + qp->sq.swr_ctx[idx].wrid = wr->wr_id; + qp->sq.swr_ctx[idx].w_list.opcode = mlx5_opcode; + qp->sq.swr_ctx[idx].wqe_head = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.swr_ctx[idx].w_list.next = qp->sq.cur_post; + qp->sq.swr_ctx[idx].sig_piped = !!(wr->send_flags & IB_SEND_SIG_PIPELINED); +} + +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_mr *mr; + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf = qp->bf; + int uninitialized_var(size); + void *qend = qp->sq.qend; + unsigned long flags; + unsigned idx; + int err = 0; + int inl = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + + spin_lock_irqsave(&qp->sq.lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "Invalid opcode 0x%x\n", wr->opcode); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + fence = qp->fm_cache; + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "Max gs exceeded %d (max = %d)\n", wr->num_sge, qp->sq.max_gs); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare WQE\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.swr_ctx[idx].wr_data = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare LOCAL_INV WQE\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_FAST_REG_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.swr_ctx[idx].wr_data = 
IB_WR_FAST_REG_MR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare FAST_REG_MR WQE\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_REG_INDIR_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.swr_ctx[idx].wr_data = IB_WR_REG_INDIR_MR; + ctrl->imm = cpu_to_be32(wr->wr.indir_reg.mkey); + err = set_indir_reg_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare indir_reg wqe\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + case IB_WR_REG_SIG_MR: + qp->sq.swr_ctx[idx].wr_data = IB_WR_REG_SIG_MR; + mr = to_mmr(wr->wr.sig_handover.sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare REG_SIG_MR WQE\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare REG_SIG_MR WQE\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare REG_SIG_MR WQE\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare REG_SIG_MR WQE\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare REG_SIG_MR WQE\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + num_sge = 0; + goto skip_psv; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_SMI: + if (!mlx5_core_is_pf(mdev)) { + err = -EINVAL; + mlx5_ib_warn(dev, "Only physical function is allowed to send SMP MADs\n"); + *bad_wr = wr; + goto out; + } + case IB_QPT_GSI: + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + case MLX5_IB_QPT_SW_CNAK: + set_mlx_seg(seg, &((struct mlx5_send_wr *)wr)->sel.mlx); + seg += sizeof(struct mlx5_mlx_seg); + size += sizeof(struct mlx5_mlx_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.swr_ctx[idx].wr_data = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + set_reg_umr_segment(seg, wr, !!(MLX5_CAP_GEN(mdev, 
atomic))); + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "Failed to prepare inline data segment\n"); + *bad_wr = wr; + goto out; + } + inl = 1; + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + finish_wqe(qp, ctrl, size, idx, wr, nreq, + get_fence(fence, wr), next_fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + + if (bf->need_lock) + spin_lock(&bf->lock); + else + __acquire(&bf->lock); + + /* TBD enable WC */ + if (BF_ENABLE && nreq == 1 && bf->uuarn && inl && size > 1 && + size <= bf->buf_size / 16) { + mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); + /* wc_wmb(); */ + } else { + mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, + MLX5_GET_DOORBELL_LOCK(&bf->lock32)); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. 
+ */ + mmiowb(); + } + bf->offset ^= bf->buf_size; + if (bf->need_lock) + spin_unlock(&bf->lock); + else + __release(&bf->lock); + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.rwr_ctx[ind].wrid = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx5_qp_path *path) +{ + struct mlx5_core_dev *dev = ibdev->mdev; + + memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); + ib_ah_attr->port_num = path->port; + + if (ib_ah_attr->port_num == 0 || + ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports)) + return; + + ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? 
path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); + } +} + +static int query_raw_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, u8 *sq_state) +{ + void *out; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_sq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_sq(dev->mdev, qp->msq.qpn, out); + if (err) + goto out; + + sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); + *sq_state = MLX5_GET(sqc, sqc, state); + qp->sq_state = *sq_state; + +out: + kvfree(out); + return err; +} + +static int query_raw_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, qp->mrq.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + qp->rq_state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) + return -EINVAL; + + if (*qp_state == MLX5_QP_STATE) { + if (qp->state == IB_QPS_INIT || + qp->state == IB_QPS_RTR || + qp->state == IB_QPS_RTS) { + *qp_state = qp->state; + } else { + return -EINVAL; + } + } + + return 0; +} + +static int sq_state_to_qp_state(u8 sq_state, struct mlx5_ib_qp *qp, + u8 *qp_state) +{ + switch (sq_state) { + case MLX5_SQC_STATE_RST: + *qp_state = IB_QPS_RESET; + break; + case MLX5_SQC_STATE_RDY: + if (qp->state == IB_QPS_INIT || + qp->state == IB_QPS_RTR || + qp->state == IB_QPS_RTS) + *qp_state = qp->state; + else + return -EINVAL; + break; + case MLX5_SQC_STATE_ERR: + if (qp->state == IB_QPS_SQE || + qp->state == IB_QPS_ERR) + *qp_state = qp->state; + else + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int rq_state_to_qp_state(u8 rq_state, struct mlx5_ib_qp *qp, + u8 *qp_state) +{ + switch (rq_state) { + case MLX5_RQC_STATE_RST: + *qp_state = IB_QPS_RESET; + break; + case MLX5_RQC_STATE_RDY: + if (qp->state == IB_QPS_INIT || + qp->state == IB_QPS_RTR || + qp->state == IB_QPS_RTS) + *qp_state = qp->state; + else + return -EINVAL; + break; + case MLX5_RQC_STATE_ERR: + *qp_state = IB_QPS_ERR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int query_raw_qp_state(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u8 
*raw_qp_state) +{ + int err; + u8 sq_state = 0; + u8 rq_state = 0; + + if (!qp->sq.wqe_cnt && !qp->rq.wqe_cnt) + return -EINVAL; + + if (qp->sq.wqe_cnt) { + err = query_raw_qp_sq_state(dev, qp, &sq_state); + if (err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_qp_rq_state(dev, qp, &rq_state); + if (err) + return err; + } + + if (qp->sq.wqe_cnt && qp->rq.wqe_cnt) + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_qp_state); + else if (qp->sq.wqe_cnt) + return sq_state_to_qp_state(sq_state, qp, raw_qp_state); + else + return rq_state_to_qp_state(rq_state, qp, raw_qp_state); +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int mlx5_state; + int err = 0; + u8 raw_qp_state; + + mutex_lock(&qp->mutex); + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + err = query_raw_qp_state(dev, qp, &raw_qp_state); + if (err) + goto out; + qp->state = raw_qp_state; + qp_attr->port_num = 1; + } else { + outb = kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) { + err = -ENOMEM; + goto out; + } + + context = &outb->ctx; + err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, + sizeof(*outb)); + if (err) { + kfree(outb); + goto out; + } + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = be16_to_cpu(context->alt_path.pkey_index); + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + + if (be32_to_cpu(context->flags_pd) & MLX5_QP_DRAIN_SIGERR) + qp_init_attr->create_flags |= IB_QP_CREATE_SIGNATURE_PIPELINE; + + + kfree(outb); + } + + qp_attr->qp_state = qp->state; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.max_post; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_init_attr->qp_context = 
ibqp->qp_context; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->srq = ibqp->srq; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + + if (qp->flags & MLX5_IB_QP_CAP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + + if (qp->flags & MLX5_IB_QP_CAP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + + if (qp->flags & MLX5_IB_QP_CAP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_xrcd *xrcd; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + if (err) { + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + return err; + } + + kfree(xrcd); + + return 0; +} + +static u32 atomic_mode_dct(struct mlx5_ib_dev *dev) +{ + unsigned long mask; + unsigned long tmp; + + mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp) & + MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + tmp = find_last_bit(&mask, BITS_PER_LONG); + if (tmp < 2) + return MLX5_ATOMIC_MODE_DCT_NONE; + + if (tmp == 2) + return MLX5_ATOMIC_MODE_DCT_CX; + + return tmp << MLX5_ATOMIC_MODE_DCT_OFF; +} + +static u32 ib_to_dct_acess(struct mlx5_ib_dev *dev, u32 ib_flags) +{ + u32 flags = 0; + + if (ib_flags & IB_ACCESS_REMOTE_READ) + flags |= MLX5_DCT_BIT_RRE; + if (ib_flags & IB_ACCESS_REMOTE_WRITE) + flags |= (MLX5_DCT_BIT_RWE | MLX5_DCT_BIT_RRE); + if (ib_flags & IB_ACCESS_REMOTE_ATOMIC) { + flags |= (MLX5_DCT_BIT_RAE | MLX5_DCT_BIT_RWE | MLX5_DCT_BIT_RRE); + flags |= atomic_mode_dct(dev); + } + + return flags; +} + +static void mlx5_ib_dct_event(struct mlx5_core_dct *dct, int type) +{ + struct ib_dct *ibdct = &to_mibdct(dct)->ibdct; + struct ib_event event; + + if (ibdct->event_handler) { + event.device = ibdct->device; + event.element.dct = ibdct; + switch (type) { + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EXP_EVENT_DCT_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EXP_EVENT_DCT_ACCESS_ERR; + break; + case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION: + event.event = IB_EXP_EVENT_DCT_KEY_VIOLATION; + break; + default: + printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on DCT %06x\n", type, dct->dctn); + return; + } + + ibdct->event_handler(&event, ibdct->dct_context); + } +} + +struct ib_dct *mlx5_ib_create_dct(struct ib_pd *pd, + struct ib_dct_init_attr *attr, + struct ib_udata *udata) +{ + struct mlx5_create_dct_mbox_in *in; + struct 
mlx5_dct_context *dctx; + struct mlx5_ib_create_dct ucmd; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_dct *dct; + void *dctc; + int cqe_sz; + int err; + u32 uidx = 0; + u32 cqn; + + if (pd && pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_err(dev, "copy failed\n"); + return ERR_PTR(-EFAULT); + } + + if (udata->inlen) + uidx = ucmd.uidx; + else + uidx = 0xffffff; + } else { + uidx = 0xffffff; + } + + dct = kzalloc(sizeof(*dct), GFP_KERNEL); + if (!dct) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_alloc; + } + + dctx = &in->context; + + cqn = to_mcq(attr->cq)->mcq.cqn; + if (cqn & 0xff000000) { + mlx5_ib_warn(dev, "invalid cqn 0x%x\n", cqn); + err = -EINVAL; + goto err_alloc; + } + dctx->cqn = cpu_to_be32(cqn); + dctx->flags |= cpu_to_be32(ib_to_dct_acess(dev, attr->access_flags)); + + if (attr->inline_size) { + cqe_sz = mlx5_ib_get_cqe_size(dev, attr->cq); + if (cqe_sz == 128) { + dctx->cs_res = MLX5_DCT_CS_RES_64; + attr->inline_size = 64; + } else { + attr->inline_size = 0; + } + } + dctx->min_rnr = attr->min_rnr_timer; + dctx->srqn = cpu_to_be32(to_msrq(attr->srq)->msrq.srqn); + dctx->pdn = cpu_to_be32(to_mpd(pd)->pdn); + dctx->tclass_flow_label |= cpu_to_be32(attr->tclass << 24); + dctx->tclass_flow_label |= cpu_to_be32(attr->flow_label); + dctx->access_key = cpu_to_be64(attr->dc_key); + dctx->mtu = attr->mtu; + dctx->port = attr->port; + dctx->pkey_index = cpu_to_be16(attr->pkey_index); + dctx->mgid_index = attr->gid_index; + dctx->hop_limit = attr->hop_limit; + + if (MLX5_CAP_GEN(dev->mdev, cqe_version)) { + dctc = MLX5_ADDR_OF(create_dct_in, in, dct_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(dctc, dctc, user_index, uidx); + } + + err = mlx5_core_create_dct(dev->mdev, &dct->mdct, in); + if (err) + goto err_alloc; + + dct->ibdct.dct_num = dct->mdct.dctn; + dct->mdct.event = mlx5_ib_dct_event; + kfree(in); + return &dct->ibdct; + +err_alloc: + kfree(in); + kfree(dct); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_dct(struct ib_dct *dct) +{ + struct mlx5_ib_dev *dev = to_mdev(dct->device); + struct mlx5_ib_dct *mdct = to_mdct(dct); + int err; + + err = mlx5_core_destroy_dct(dev->mdev, &mdct->mdct); + if (!err) + kfree(mdct); + + return err; +} + +static int dct_to_ib_access(u32 dc_flags) +{ + u32 flags = 0; + + if (dc_flags & MLX5_DCT_BIT_RRE) + flags |= IB_ACCESS_REMOTE_READ; + if (dc_flags & MLX5_QP_BIT_RWE) + flags |= IB_ACCESS_REMOTE_WRITE; + if ((dc_flags & MLX5_ATOMIC_MODE_CX) == MLX5_ATOMIC_MODE_CX) + flags |= IB_ACCESS_REMOTE_ATOMIC; + + return flags; +} + +int mlx5_ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(dct->device); + struct mlx5_ib_dct *mdct = to_mdct(dct); + struct mlx5_query_dct_mbox_out *out; + struct mlx5_dct_context *ctx; + int err; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_dct_query(dev->mdev, &mdct->mdct, out); + if (err) + goto out; + + ctx = &out->ctx; + + attr->dc_key = be64_to_cpu(ctx->access_key); + attr->port = ctx->port; + attr->access_flags = dct_to_ib_access(be32_to_cpu(ctx->flags)); + attr->min_rnr_timer = ctx->min_rnr & 0x1f; + attr->tclass = be32_to_cpu(ctx->tclass_flow_label) >> 24; + attr->flow_label = be32_to_cpu(ctx->tclass_flow_label) & 0xfffff; + attr->mtu = ctx->mtu & 7; + attr->pkey_index = be16_to_cpu(ctx->pkey_index); + attr->gid_index = ctx->mgid_index; + attr->hop_limit = 
ctx->hop_limit; + attr->key_violations = be32_to_cpu(ctx->access_violations); + attr->state = ctx->state & 0xf; + +out: + kfree(out); + return err; +} + +int mlx5_ib_arm_dct(struct ib_dct *dct, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(dct->device); + struct mlx5_ib_dct *mdct = to_mdct(dct); + struct mlx5_ib_arm_dct ucmd; + struct mlx5_ib_arm_dct_resp resp; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) { + mlx5_ib_err(dev, "copy failed\n"); + return err; + } + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + err = mlx5_core_arm_dct(dev->mdev, &mdct->mdct); + if (err) + goto out; + + memset(&resp, 0, sizeof(resp)); + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) + mlx5_ib_err(dev, "copy failed\n"); + +out: + return err; +} + +static int mlx5_ib_create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, + struct ib_wq_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev; + __be64 *rq_pas0; + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + dev = to_mdev(pd->device); + + if (init_attr->mp_rq.use_mp_rq) { + if (MLX5_CAP_GEN(dev->mdev, striding_rq) != RQ_TYPE_STRIDE) + return -EOPNOTSUPP; + + if (init_attr->mp_rq.use_shift & ~IB_MP_RQ_2BYTES_SHIFT || + init_attr->mp_rq.single_stride_log_num_of_bytes < MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES || + init_attr->mp_rq.single_stride_log_num_of_bytes > MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES || + init_attr->mp_rq.single_wqe_log_num_of_strides < MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES || + init_attr->mp_rq.single_wqe_log_num_of_strides > MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) + return -EINVAL; + } + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + if (init_attr->vlan_offloads & IB_WQ_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap))) + return -EOPNOTSUPP; + } else { + MLX5_SET(rqc, rqc, vlan_strip_disable, 1); + } + MLX5_SET(rqc, rqc, mem_rq_type, + MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, user_index, rwq->user_index); + MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + if (init_attr->mp_rq.use_mp_rq) { + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_STRQ_CYCLIC); + MLX5_SET(wq, wq, single_wqe_log_num_of_strides, + (init_attr->mp_rq.single_wqe_log_num_of_strides - + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES)); + MLX5_SET(wq, wq, single_stride_log_num_of_bytes, + (init_attr->mp_rq.single_stride_log_num_of_bytes - + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES)); + if (init_attr->mp_rq.use_shift == IB_MP_RQ_2BYTES_SHIFT) + MLX5_SET(wq, wq, two_byte_shift_en, 0x1); + } else { + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + } + if (MLX5_CAP_GEN(dev->mdev, pad_cap)) { + if (init_attr->flags & IB_CREATE_WQ_FLAG_RX_END_PADDING) + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + else + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_NONE); + } + MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); + MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); + MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); + MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); + MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); + MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); + MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); + rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + 
mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); + err = mlx5_core_create_rq(dev->mdev, in, inlen, &rwq->rqn); + kvfree(in); + + return err; +} + +static int set_user_rq_size(struct mlx5_ib_dev *dev, + struct ib_wq_init_attr *wq_init_attr, + struct mlx5_ib_create_wq *ucmd, + struct mlx5_ib_rwq *rwq) +{ + /* TBD: Sanity check RQ size before proceeding */ + rwq->wqe_count = ucmd->rq_wqe_count; + rwq->wqe_shift = ucmd->rq_wqe_shift; + rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift); + rwq->log_rq_stride = rwq->wqe_shift; + rwq->log_rq_size = ilog2(rwq->wqe_count); + return 0; +} + +static int mlx5_ib_prepare_user_rq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_wq ucmd; + int err; + + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_err(dev, "copy failed\n"); + return -EFAULT; + } + + err = set_user_rq_size(dev, init_attr, &ucmd, rwq); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + return err; + } + + err = create_user_rq(dev, pd, rwq, &ucmd); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + return err; + } + + rwq->user_index = ucmd.user_index; + return 0; +} + +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_rwq *rwq; + int err; + + if (!udata) { + mlx5_ib_warn(dev, "no udata\n"); + return ERR_PTR(-ENOSYS); + } + + switch (init_attr->wq_type) { + case IB_WQT_SRQ: + mlx5_ib_err(dev, "unsupported wq type SRQ\n"); + return ERR_PTR(-ENOSYS); + case IB_WQT_RQ: + rwq = kzalloc(sizeof(*rwq), GFP_KERNEL); + if (!rwq) + return ERR_PTR(-ENOMEM); + err = mlx5_ib_prepare_user_rq(pd, init_attr, udata, rwq); + if (err) + goto err; + err = mlx5_ib_create_rq(rwq, pd, init_attr); + if (err) { + mlx5_ib_warn(dev, "create RQ failed: %d\n", err); + goto err_user_rq; + } + break; + default: + mlx5_ib_warn(dev, "unsupported wq type %d\n", + init_attr->wq_type); + return ERR_PTR(-EINVAL); + } + + rwq->ibwq.wq_num = rwq->rqn; + rwq->ibwq.state = IB_WQS_RESET; + return &rwq->ibwq; + +err_user_rq: + destroy_user_rq(pd, rwq); +err: + kfree(rwq); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_wq(struct ib_wq *wq) +{ + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + struct mlx5_ib_dev *dev = to_mdev(wq->device); + + if (wq->wq_type != IB_WQT_RQ) + return -EINVAL; + + mlx5_core_destroy_rq(dev->mdev, rwq->rqn); + destroy_user_rq(wq->pd, rwq); + kfree(rwq); + + return 0; +} + +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl; + struct mlx5_ib_dev *dev; + int inlen; + int err; + int log_tbl_sz = init_attr->log_ind_tbl_size; + int sz = 1 << log_tbl_sz; + int i; + u32 *in; + void *rqtc; + + if (!udata) + return ERR_PTR(-ENOSYS); + + rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL); + if (!rwq_ind_tbl) + return ERR_PTR(-ENOMEM); + + dev = to_mdev(device); + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err; + } + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); + + err = 
mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); + kvfree(in); + + if (err) + goto err; + + rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn; + return &rwq_ind_tbl->ib_rwq_ind_tbl; +err: + kfree(rwq_ind_tbl); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); + + dev = to_mdev(ib_rwq_ind_tbl->device); + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + + kfree(rwq_ind_tbl); + return 0; +} + +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + enum ib_wq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + struct mlx5_ib_dev *dev = to_mdev(wq->device); + int wq_state; + int curr_wq_state; + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rqn, rwq->rqn); + curr_wq_state = (attr_mask & IB_WQ_CUR_STATE) ? + wq_attr->curr_wq_state : wq->state; + wq_state = (attr_mask & IB_WQ_STATE) ? + wq_attr->wq_state : curr_wq_state; + if (curr_wq_state == IB_WQS_ERR) + curr_wq_state = MLX5_RQC_STATE_ERR; + if (wq_state == IB_WQS_ERR) + wq_state = MLX5_RQC_STATE_ERR; + MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(rqc, rqc, state, wq_state); + if (attr_mask & IB_WQ_VLAN_OFFLOADS) { + MLX5_SET(modify_rq_in, in, bitmask.vlan_strip_disable, 1); + if (wq_attr->vlan_offloads & IB_WQ_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap))) + return -EOPNOTSUPP; + MLX5_SET(rqc, rqc, vlan_strip_disable, 0); + } else { + MLX5_SET(rqc, rqc, vlan_strip_disable, 1); + } + } + + err = mlx5_core_modify_rq(dev->mdev, in, inlen); + kvfree(in); + if (!err) + rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; + + return err; +} Index: sys/dev/mlx5/mlx5_ib/mlx5_roce.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_roce.c @@ -0,0 +1,298 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +struct net_device *mlx5_ib_get_netdev(struct ib_device *ib_dev, u8 port) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + + return mlx5_get_protocol_dev(dev->mdev, MLX5_INTERFACE_PROTOCOL_ETH); +} + +static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void *mlx5_addr) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_l3_address); + void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_mac_47_32); + union ib_gid zgid; + u16 vtag; + + memset(&zgid, 0, sizeof(zgid)); + if (0 == memcmp(gid, &zgid, sizeof(zgid))) + return; + + ether_addr_copy(mlx5_addr_mac, IF_LLADDR(attr->ndev)); + + if (VLAN_TAG(attr->ndev, &vtag) == 0) { + MLX5_SET_RA(mlx5_addr, vlan_valid, 1); + MLX5_SET_RA(mlx5_addr, vlan_id, vtag); + } + + switch (attr->gid_type) { + case IB_GID_TYPE_IB: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); + break; +#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) + case IB_GID_TYPE_ROCE_V1_5: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1_5); + break; +#endif + case IB_GID_TYPE_ROCE_V2: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); + break; + + default: + WARN_ON(true); + } + + if (attr->gid_type != IB_GID_TYPE_IB) { + if (ipv6_addr_v4mapped((void *)gid)) + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV4); + else + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV6); + } + + if ((attr->gid_type == IB_GID_TYPE_IB) || + !ipv6_addr_v4mapped((void *)gid)) + memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); + else + memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); +} + +int modify_gid_roce(struct ib_device *ib_dev, u8 port, unsigned int index, + const union ib_gid *gid, const struct ib_gid_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)]; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)]; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); + + memset(in, 0, sizeof(in)); + + ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); + + MLX5_SET(set_roce_address_in, in, roce_address_index, index); + MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, + u8 *active_width) +{ + switch (eth_proto_oper) { + case MLX5_PROT_MASK(MLX5_1000BASE_CX_SGMII): + case MLX5_PROT_MASK(MLX5_1000BASE_KX): + case MLX5_PROT_MASK(MLX5_100BASE_TX): + case MLX5_PROT_MASK(MLX5_1000BASE_T): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + break; + case MLX5_PROT_MASK(MLX5_10GBASE_T): + case MLX5_PROT_MASK(MLX5_10GBASE_CX4): + case MLX5_PROT_MASK(MLX5_10GBASE_KX4): + case 
MLX5_PROT_MASK(MLX5_10GBASE_KR): + case MLX5_PROT_MASK(MLX5_10GBASE_CR): + case MLX5_PROT_MASK(MLX5_10GBASE_SR): + case MLX5_PROT_MASK(MLX5_10GBASE_ER): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5_PROT_MASK(MLX5_25GBASE_CR): + case MLX5_PROT_MASK(MLX5_25GBASE_KR): + case MLX5_PROT_MASK(MLX5_25GBASE_SR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5_PROT_MASK(MLX5_40GBASE_CR4): + case MLX5_PROT_MASK(MLX5_40GBASE_KR4): + case MLX5_PROT_MASK(MLX5_40GBASE_SR4): + case MLX5_PROT_MASK(MLX5_40GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5_PROT_MASK(MLX5_50GBASE_CR2): + case MLX5_PROT_MASK(MLX5_50GBASE_KR2): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5_PROT_MASK(MLX5_56GBASE_R4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR; + break; + case MLX5_PROT_MASK(MLX5_100GBASE_CR4): + case MLX5_PROT_MASK(MLX5_100GBASE_SR4): + case MLX5_PROT_MASK(MLX5_100GBASE_KR4): + case MLX5_PROT_MASK(MLX5_100GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_roce_port_ptys(struct ib_device *ib_dev, + struct ib_port_attr *props, u8 port) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ptys_reg *ptys; + int err; + + ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); + if (!ptys) + return -ENOMEM; + + ptys->proto_mask |= MLX5_PTYS_EN; + ptys->local_port = port; + + err = mlx5_core_access_ptys(mdev, ptys, 0); + if (err) + goto out; + + err = translate_eth_proto_oper(ptys->eth_proto_oper, + &props->active_speed, + &props->active_width); +out: + kfree(ptys); + return err; +} + +int mlx5_query_port_roce(struct ib_device *ib_dev, u8 port, + struct ib_port_attr *props) +{ + struct net_device *netdev = mlx5_ib_get_netdev(ib_dev, port); + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + enum ib_mtu netdev_ib_mtu; + u8 l3_type = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 version = MLX5_CAP_ROCE(dev->mdev, roce_version); + + + memset(props, 0, sizeof(*props)); + + if ((l3_type & MLX5_ROCE_L3_TYPE_IPV4_CAP) && + (l3_type & MLX5_ROCE_L3_TYPE_IPV6_CAP)) { +#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) + if (version & MLX5_ROCE_VERSION_1_5_CAP) + props->port_cap_flags |= IB_PORT_ROCE_V1_5; +#endif + if (version & MLX5_ROCE_VERSION_2_CAP) + props->port_cap_flags |= IB_PORT_ROCE_V2; + } +#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) + if (props->port_cap_flags & (IB_PORT_ROCE_V1_5 | IB_PORT_ROCE_V2)) +#else + if (props->port_cap_flags & IB_PORT_ROCE_V2) +#endif + if (version & MLX5_ROCE_VERSION_1_CAP) + props->port_cap_flags |= IB_PORT_ROCE; + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + if (mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, + (u16 *)&props->qkey_viol_cntr)) + printf("mlx5_ib: WARN: ""%s failed to query qkey violations counter\n", __func__); + + + if (!netdev) + return 0; + + if (netif_running(netdev) && netif_carrier_ok(netdev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + netdev_ib_mtu = iboe_get_mtu(netdev->if_mtu); + props->active_mtu = min(props->max_mtu, 
netdev_ib_mtu); + + mlx5_query_roce_port_ptys(ib_dev, props, port); + + return 0; +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port, + int index, __be16 ah_s_udp_port) +{ + struct ib_gid_attr attr; + union ib_gid gid; + + ib_get_cached_gid(&dev->ib_dev, port, index, &gid, &attr); + if (attr.gid_type != IB_GID_TYPE_ROCE_V2) + return 0; + + if (ah_s_udp_port) { + u16 hp = be16_to_cpu(ah_s_udp_port); + + if (hp >= MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port) && + hp <= MLX5_CAP_ROCE(dev->mdev, r_roce_max_src_udp_port)) + return ah_s_udp_port; + } + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port, + int index, int *gid_type) +{ + struct ib_gid_attr attr; + union ib_gid gid; + int ret; + + ret = ib_get_cached_gid(&dev->ib_dev, port, index, &gid, &attr); + + if (!ret) + *gid_type = attr.gid_type; + + return ret; +} Index: sys/dev/mlx5/mlx5_ib/mlx5_srq.c =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/mlx5_srq.c @@ -0,0 +1,507 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, int type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on SRQ %06x\n", type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, + struct ib_udata *udata, int buf_size, int *inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; + void *xsrqc; + int err; + int npages; + int page_shift; + int ncont; + int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + u32 offset; + + ucmdlen = (drv_data < sizeof(ucmd)) ? + drv_data : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_err(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved1 != 0) { + mlx5_ib_warn(dev, "corrupted ucmd\n"); + return -EINVAL; + } + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_warn(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!(*in)) { + mlx5_ib_err(dev, "failed allocate mbox\n"); + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_warn(dev, "map doorbell failed\n"); + goto err_in; + } + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + + if (MLX5_CAP_GEN(dev->mdev, cqe_version)) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + if (drv_data > offsetof(struct mlx5_ib_create_srq, uidx)) + MLX5_SET(xrc_srqc, xsrqc, user_index, ucmd.uidx); + else + MLX5_SET(xrc_srqc, xsrqc, user_index, 0xffffff); + } + + return 0; + +err_in: + kvfree(*in); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, int buf_size, + int *inlen) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + int page_shift; + int npages; + void *xsrqc; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + if 
(mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + mlx5_ib_err(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + page_shift = srq->buf.page_shift; + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); + mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n", + buf_size, page_shift, srq->buf.npages, npages); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + mlx5_ib_err(dev, "failed allocate mbox\n"); + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, (*in)->pas); + + srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + if (MLX5_CAP_GEN(dev->mdev, cqe_version)) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(xrc_srqc, xsrqc, user_index, 0xffffff); + } + + return 0; + +err_in: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kfree(srq->wrid); + mlx5_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_srq *srq; + int desc_size; + int buf_size; + int err; + struct mlx5_create_srq_mbox_in *uninitialized_var(in); + int uninitialized_var(inlen); + int is_xrc; + u32 flgs, xrcdn; + __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes) { + mlx5_ib_warn(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(int, 32, desc_size); + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", + desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, + srq->msrq.max_avail_gather); + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + else + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? 
"user" : "kernel", err); + goto err_srq; + } + + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + in->ctx.state_log_sz = ilog2(srq->msrq.max); + flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; + xrcdn = 0; + if (is_xrc) { + xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn); + } else if (init_attr->srq_type == IB_SRQT_BASIC) { + xrcdn = to_mxrcd(dev->devr.x0)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn); + } + + in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF)); + + in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn); + in->ctx.db_record = cpu_to_be64(srq->db.dma); + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc); + kvfree(in); + if (err) { + mlx5_ib_warn(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_err(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_query_srq_mbox_out *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm); + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. 
*/ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} Index: sys/dev/mlx5/mlx5_ib/user.h =================================================================== --- /dev/null +++ sys/dev/mlx5/mlx5_ib/user.h @@ -0,0 +1,318 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef MLX5_IB_USER_H +#define MLX5_IB_USER_H + +#include + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_WQ_FLAG_SIGNATURE = 1 << 0, +}; + + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; + __u32 max_desc_sz_sq_dc; + __u32 atomic_arg_sizes_dc; + __u32 reserved1; + __u32 flags; + __u32 reserved2[5]; +}; + +enum mlx5_exp_ib_alloc_ucontext_data_resp_mask { + MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_COMP_MAX_NUM = 1 << 0, + MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_VERSION = 1 << 1, + MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MIN = 1 << 2, + MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MAX = 1 << 3, + MLX5_EXP_ALLOC_CTX_RESP_MASK_HCA_CORE_CLOCK_OFFSET = 1 << 4, +}; + +struct mlx5_exp_ib_alloc_ucontext_data_resp { + __u32 comp_mask; /* use mlx5_ib_exp_alloc_ucontext_data_resp_mask */ + __u16 cqe_comp_max_num; + __u8 cqe_version; + __u8 reserved; + __u16 rroce_udp_sport_min; + __u16 rroce_udp_sport_max; + __u32 hca_core_clock_offset; +}; + +struct mlx5_exp_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; + __u32 max_desc_sz_sq_dc; + __u32 atomic_arg_sizes_dc; + __u32 reserved1; + __u32 flags; + __u32 reserved2[5]; + /* Some more reserved fields for + * future growth of mlx5_ib_alloc_ucontext_resp */ + __u64 prefix_reserved[8]; + struct mlx5_exp_ib_alloc_ucontext_data_resp exp_data; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +enum mlx5_exp_ib_create_cq_mask { + MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_EN = 1 << 0, + MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_RECV_TYPE = 1 << 1, + MLX5_EXP_CREATE_CQ_MASK_RESERVED = 1 << 2, +}; + +enum mlx5_exp_cqe_comp_recv_type { + MLX5_IB_CQE_FORMAT_HASH, + MLX5_IB_CQE_FORMAT_CSUM, +}; + +struct mlx5_exp_ib_create_cq_data { + __u32 comp_mask; /* use mlx5_exp_ib_creaet_cq_mask */ + __u8 cqe_comp_en; + __u8 cqe_comp_recv_type; /* use mlx5_exp_cqe_comp_recv_type */ + __u16 reserved; +}; + +struct mlx5_exp_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ + + /* Some more reserved fields for future growth of mlx5_ib_create_cq */ + __u64 prefix_reserved[8]; + + /* sizeof prefix aligned with mlx5_ib_create_cq */ + __u64 size_of_prefix; + struct 
mlx5_exp_ib_create_cq_data exp_data; +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; +}; + +enum mlx5_exp_ib_create_qp_mask { + MLX5_EXP_CREATE_QP_MASK_UIDX = 1 << 0, + MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD = 1 << 1, + MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX = 1 << 2, + MLX5_EXP_CREATE_QP_MASK_FLAGS_IDX = 1 << 3, + MLX5_EXP_CREATE_QP_MASK_RESERVED = 1 << 4, +}; + +enum mlx5_exp_create_qp_flags { + MLX5_EXP_CREATE_QP_MULTI_PACKET_WQE_REQ_FLAG = 1 << 0, +}; + +enum mlx5_exp_drv_create_qp_uar_idx { + MLX5_EXP_CREATE_QP_DB_ONLY_UUAR = -1 +}; + +struct mlx5_exp_ib_create_qp_data { + __u32 comp_mask; /* use mlx5_exp_ib_create_qp_mask */ + __u32 uidx; + __u64 sq_buf_addr; + __u32 wc_uar_index; + __u32 flags; /* use mlx5_exp_create_qp_flags */ +}; + +struct mlx5_exp_ib_create_qp { + /* To allow casting to mlx5_ib_create_qp the prefix is the same as + * struct mlx5_ib_create_qp prefix + */ + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; + + /* Some more reserved fields for future growth of mlx5_ib_create_qp */ + __u64 prefix_reserved[8]; + + /* sizeof prefix aligned with mlx5_ib_create_qp */ + __u64 size_of_prefix; + + /* Experimental data + * Add new experimental data only inside the exp struct + */ + struct mlx5_exp_ib_create_qp_data exp; +}; + +enum { + MLX5_EXP_INVALID_UUAR = -1, +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; + __u32 rsvd; +}; + +enum mlx5_exp_ib_create_qp_resp_mask { + MLX5_EXP_CREATE_QP_RESP_MASK_FLAGS_IDX = 1 << 0, + MLX5_EXP_CREATE_QP_RESP_MASK_RESERVED = 1 << 1, +}; + +enum mlx5_exp_create_qp_resp_flags { + MLX5_EXP_CREATE_QP_RESP_MULTI_PACKET_WQE_FLAG = 1 << 0, +}; + +struct mlx5_exp_ib_create_qp_resp_data { + __u32 comp_mask; /* use mlx5_exp_ib_create_qp_resp_mask */ + __u32 flags; /* use mlx5_exp_create_qp_resp_flags */ +}; + +struct mlx5_exp_ib_create_qp_resp { + __u32 uuar_index; + __u32 rsvd; + + /* Some more reserved fields for future growth of mlx5_ib_create_qp_resp */ + __u64 prefix_reserved[8]; + + /* sizeof prefix aligned with mlx5_ib_create_qp_resp */ + __u64 size_of_prefix; + + /* Experimental data + * Add new experimental data only inside the exp struct + */ + struct mlx5_exp_ib_create_qp_resp_data exp; +}; + +struct mlx5_ib_create_dct { + __u32 uidx; + __u32 reserved; +}; + +struct mlx5_ib_arm_dct { + __u64 reserved0; + __u64 reserved1; +}; + +struct mlx5_ib_arm_dct_resp { + __u64 reserved0; + __u64 reserved1; +}; + +struct mlx5_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 user_index; + __u32 flags; +}; + +#endif /* MLX5_IB_USER_H */ Index: sys/modules/Makefile =================================================================== --- sys/modules/Makefile +++ sys/modules/Makefile @@ -238,6 +238,7 @@ ${_mlx4en} \ ${_mlx5} \ ${_mlx5en} \ + ${_mlx5ib} \ ${_mly} \ mmc \ mmcsd \ @@ -641,6 +642,7 @@ defined(ALL_MODULES) _mlx4en= mlx4en _mlx5en= mlx5en +_mlx5ib= mlx5ib .endif .if ${MK_OFED} != "no" || 
defined(ALL_MODULES) _mlx4ib= mlx4ib Index: sys/modules/mlx5ib/Makefile =================================================================== --- /dev/null +++ sys/modules/mlx5ib/Makefile @@ -0,0 +1,24 @@ +# $FreeBSD$ +.PATH: ${.CURDIR}/../../dev/mlx5/mlx5_ib + +KMOD=mlx5ib +SRCS= \ +mlx5_ah.c \ +mlx5_cq.c \ +mlx5_doorbell.c \ +mlx5_mad.c \ +mlx5_mem.c \ +mlx5_mr.c \ +mlx5_roce.c \ +mlx5_srq.c \ +mlx5_main.c \ +mlx5_qp.c \ +device_if.h bus_if.h vnode_if.h pci_if.h \ +opt_inet.h opt_inet6.h + +CFLAGS+= -I${.CURDIR}/../../ofed/include +CFLAGS+= -I${.CURDIR}/../../compat/linuxkpi/common/include + +.include <bsd.kmod.mk> + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith ${GCC_MS_EXTENSIONS}
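
For context only, not part of the patch: a minimal sketch of how a kernel consumer would reach the SRQ entry points this driver registers (mlx5_ib_create_srq() and mlx5_ib_post_srq_recv()), through the generic OFED verbs wrappers ib_create_srq() and ib_post_srq_recv(). The function name example_srq_recv and its pd/dma_addr/lkey parameters are assumptions standing in for state a real ULP would already hold (device, PD, DMA-mapped buffer, completion queue handling); everything else is the stock ib_verbs API of the OFED stack this code builds against.

	/* Sketch of a kernel ULP posting one receive to a basic SRQ. */
	#include <rdma/ib_verbs.h>

	static int
	example_srq_recv(struct ib_pd *pd, u64 dma_addr, u32 len, u32 lkey)
	{
		struct ib_srq_init_attr init = {
			/* srq_type left as 0 == IB_SRQT_BASIC */
			.attr = { .max_wr = 128, .max_sge = 1 },
		};
		struct ib_sge sge = {
			.addr = dma_addr, .length = len, .lkey = lkey,
		};
		struct ib_recv_wr wr = {
			.wr_id = 1, .sg_list = &sge, .num_sge = 1,
		};
		struct ib_recv_wr *bad;
		struct ib_srq *srq;
		int err;

		/* Dispatches to mlx5_ib_create_srq() on an mlx5ib device. */
		srq = ib_create_srq(pd, &init);
		if (IS_ERR(srq))
			return (PTR_ERR(srq));

		/* Dispatches to mlx5_ib_post_srq_recv(). */
		err = ib_post_srq_recv(srq, &wr, &bad);
		if (err)
			ib_destroy_srq(srq);
		return (err);
	}

The wrappers only route to the provider; the buffer described by dma_addr/lkey must stay DMA-mapped until the corresponding completion is reaped from the SRQ's CQ, which is omitted here.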