diff --git a/contrib/ofed/libcxgb4/dev.c b/contrib/ofed/libcxgb4/dev.c index 89494f9b46d7..d3c289dad9f2 100644 --- a/contrib/ofed/libcxgb4/dev.c +++ b/contrib/ofed/libcxgb4/dev.c @@ -1,577 +1,584 @@ /* * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include "libcxgb4.h" #include "cxgb4-abi.h" #define PCI_VENDOR_ID_CHELSIO 0x1425 /* * Macros needed to support the PCI Device ID Table ... */ #define CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN \ static struct { \ unsigned vendor; \ unsigned device; \ } hca_table[] = { #define CH_PCI_DEVICE_ID_FUNCTION \ 0x4 #define CH_PCI_ID_TABLE_ENTRY(__DeviceID) \ { \ .vendor = PCI_VENDOR_ID_CHELSIO, \ .device = (__DeviceID), \ } #define CH_PCI_DEVICE_ID_TABLE_DEFINE_END \ } #include "t4_chip_type.h" #include "t4_pci_id_tbl.h" unsigned long c4iw_page_size; unsigned long c4iw_page_shift; unsigned long c4iw_page_mask; int ma_wr; int t5_en_wc = 1; static TAILQ_HEAD(,c4iw_dev) devices = TAILQ_HEAD_INITIALIZER(devices); static struct ibv_context_ops c4iw_ctx_ops = { .query_device = c4iw_query_device, .query_port = c4iw_query_port, .alloc_pd = c4iw_alloc_pd, .dealloc_pd = c4iw_free_pd, .reg_mr = c4iw_reg_mr, .dereg_mr = c4iw_dereg_mr, .create_cq = c4iw_create_cq, .resize_cq = c4iw_resize_cq, .destroy_cq = c4iw_destroy_cq, .create_srq = c4iw_create_srq, .modify_srq = c4iw_modify_srq, .destroy_srq = c4iw_destroy_srq, .create_qp = c4iw_create_qp, .modify_qp = c4iw_modify_qp, .destroy_qp = c4iw_destroy_qp, .query_qp = c4iw_query_qp, .create_ah = c4iw_create_ah, .destroy_ah = c4iw_destroy_ah, .attach_mcast = c4iw_attach_mcast, .detach_mcast = c4iw_detach_mcast, .post_srq_recv = c4iw_post_srq_recv, .req_notify_cq = c4iw_arm_cq, }; static struct ibv_context *c4iw_alloc_context(struct ibv_device *ibdev, int cmd_fd) { struct c4iw_context *context; struct ibv_get_context cmd; struct c4iw_alloc_ucontext_resp resp; struct c4iw_dev *rhp = to_c4iw_dev(ibdev); struct ibv_query_device qcmd; uint64_t raw_fw_ver; struct ibv_device_attr attr; context = malloc(sizeof *context); if (!context) return NULL; memset(context, 0, sizeof *context); context->ibv_ctx.cmd_fd = cmd_fd; resp.status_page_size = 0; resp.reserved = 0; if 
(ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp)) goto err_free; if (resp.reserved) PDBG("%s c4iw_alloc_ucontext_resp reserved field modified by kernel\n", __FUNCTION__); context->status_page_size = resp.status_page_size; if (resp.status_page_size) { context->status_page = mmap(NULL, resp.status_page_size, PROT_READ, MAP_SHARED, cmd_fd, resp.status_page_key); if (context->status_page == MAP_FAILED) goto err_free; } context->ibv_ctx.device = ibdev; context->ibv_ctx.ops = c4iw_ctx_ops; switch (rhp->chip_version) { case CHELSIO_T6: PDBG("%s T6/T5/T4 device\n", __FUNCTION__); case CHELSIO_T5: PDBG("%s T5/T4 device\n", __FUNCTION__); case CHELSIO_T4: PDBG("%s T4 device\n", __FUNCTION__); context->ibv_ctx.ops.async_event = c4iw_async_event; context->ibv_ctx.ops.post_send = c4iw_post_send; context->ibv_ctx.ops.post_recv = c4iw_post_receive; context->ibv_ctx.ops.poll_cq = c4iw_poll_cq; context->ibv_ctx.ops.req_notify_cq = c4iw_arm_cq; break; default: PDBG("%s unknown hca type %d\n", __FUNCTION__, rhp->chip_version); goto err_unmap; break; } if (!rhp->mmid2ptr) { int ret; ret = ibv_cmd_query_device(&context->ibv_ctx, &attr, &raw_fw_ver, &qcmd, sizeof qcmd); if (ret) goto err_unmap; rhp->max_mr = attr.max_mr; rhp->mmid2ptr = calloc(attr.max_mr, sizeof(void *)); if (!rhp->mmid2ptr) { goto err_unmap; } if (rhp->abi_version < 3) { fprintf(stderr, "Warning: iw_cxgb4 driver is of older version" " than libcxgb4:: %d\n", rhp->abi_version); rhp->max_qp = T4_QID_BASE + attr.max_qp; } else { rhp->max_qp = context->status_page->qp_start + context->status_page->qp_size; } rhp->qpid2ptr = calloc(rhp->max_qp, sizeof(void *)); if (!rhp->qpid2ptr) { goto err_unmap; } if (rhp->abi_version < 3) rhp->max_cq = T4_QID_BASE + attr.max_cq; else rhp->max_cq = context->status_page->cq_start + context->status_page->cq_size; rhp->cqid2ptr = calloc(rhp->max_cq, sizeof(void *)); if (!rhp->cqid2ptr) goto err_unmap; /* Disable userspace WC if architecture/adapter does not * support WC. 
* Note: To forcefully disable WC in kernel driver use the * loader tunable "hw.cxl.write_combine=0" */ if (t5_en_wc && !context->status_page->wc_supported) { t5_en_wc = 0; } } return &context->ibv_ctx; err_unmap: munmap(context->status_page, context->status_page_size); err_free: if (rhp->cqid2ptr) free(rhp->cqid2ptr); if (rhp->qpid2ptr) free(rhp->cqid2ptr); if (rhp->mmid2ptr) free(rhp->cqid2ptr); free(context); return NULL; } static void c4iw_free_context(struct ibv_context *ibctx) { struct c4iw_context *context = to_c4iw_context(ibctx); if (context->status_page_size) munmap(context->status_page, context->status_page_size); free(context); } static struct verbs_device_ops c4iw_dev_ops = { .alloc_context = c4iw_alloc_context, .free_context = c4iw_free_context }; #ifdef STALL_DETECTION int stall_to; static void dump_cq(struct c4iw_cq *chp) { int i; fprintf(stderr, "CQ: %p id %u queue %p cidx 0x%08x sw_queue %p sw_cidx %d sw_pidx %d sw_in_use %d depth %u error %u gen %d " "cidx_inc %d bits_type_ts %016" PRIx64 " notempty %d\n", chp, chp->cq.cqid, chp->cq.queue, chp->cq.cidx, chp->cq.sw_queue, chp->cq.sw_cidx, chp->cq.sw_pidx, chp->cq.sw_in_use, chp->cq.size, chp->cq.error, chp->cq.gen, chp->cq.cidx_inc, be64toh(chp->cq.bits_type_ts), t4_cq_notempty(&chp->cq)); for (i=0; i < chp->cq.size; i++) { u64 *p = (u64 *)(chp->cq.queue + i); fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64, i, be64toh(p[0]), be64toh(p[1])); if (i == chp->cq.cidx) fprintf(stderr, " <-- cidx\n"); else fprintf(stderr, "\n"); p+= 2; fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1])); p+= 2; fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1])); p+= 2; fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1])); p+= 2; } } static void dump_qp(struct c4iw_qp *qhp) { int i; int j; struct t4_swsqe *swsqe; struct t4_swrqe *swrqe; u16 cidx, pidx; u64 *p; fprintf(stderr, "QP: %p id %u error %d flushed %d qid_mask 0x%x\n" " SQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u wq_pidx %u depth %u flags 0x%x flush_cidx %d\n" " RQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u depth %u\n", qhp, qhp->wq.sq.qid, qhp->wq.error, qhp->wq.flushed, qhp->wq.qid_mask, qhp->wq.sq.qid, qhp->wq.sq.queue, qhp->wq.sq.sw_sq, qhp->wq.sq.cidx, qhp->wq.sq.pidx, qhp->wq.sq.in_use, qhp->wq.sq.wq_pidx, qhp->wq.sq.size, qhp->wq.sq.flags, qhp->wq.sq.flush_cidx, qhp->wq.rq.qid, qhp->wq.rq.queue, qhp->wq.rq.sw_rq, qhp->wq.rq.cidx, qhp->wq.rq.pidx, qhp->wq.rq.in_use, qhp->wq.rq.size); cidx = qhp->wq.sq.cidx; pidx = qhp->wq.sq.pidx; if (cidx != pidx) fprintf(stderr, "SQ: \n"); while (cidx != pidx) { swsqe = &qhp->wq.sq.sw_sq[cidx]; fprintf(stderr, "%04u: wr_id %016" PRIx64 " sq_wptr %08x read_len %u opcode 0x%x " "complete %u signaled %u cqe %016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n", cidx, swsqe->wr_id, swsqe->idx, swsqe->read_len, swsqe->opcode, swsqe->complete, swsqe->signaled, htobe64(((uint64_t *)&swsqe->cqe)[0]), htobe64(((uint64_t *)&swsqe->cqe)[1]), htobe64(((uint64_t *)&swsqe->cqe)[2]), htobe64(((uint64_t *)&swsqe->cqe)[3])); if (++cidx == qhp->wq.sq.size) cidx = 0; } fprintf(stderr, "SQ WQ: \n"); p = (u64 *)qhp->wq.sq.queue; for (i=0; i < qhp->wq.sq.size * T4_SQ_NUM_SLOTS; i++) { for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) { fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ", i, be64toh(p[0]), be64toh(p[1])); if (j == 0 && i == qhp->wq.sq.wq_pidx) fprintf(stderr, " <-- pidx"); fprintf(stderr, "\n"); p += 2; } } cidx 
= qhp->wq.rq.cidx; pidx = qhp->wq.rq.pidx; if (cidx != pidx) fprintf(stderr, "RQ: \n"); while (cidx != pidx) { swrqe = &qhp->wq.rq.sw_rq[cidx]; fprintf(stderr, "%04u: wr_id %016" PRIx64 "\n", cidx, swrqe->wr_id ); if (++cidx == qhp->wq.rq.size) cidx = 0; } fprintf(stderr, "RQ WQ: \n"); p = (u64 *)qhp->wq.rq.queue; for (i=0; i < qhp->wq.rq.size * T4_RQ_NUM_SLOTS; i++) { for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) { fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ", i, be64toh(p[0]), be64toh(p[1])); if (j == 0 && i == qhp->wq.rq.pidx) fprintf(stderr, " <-- pidx"); if (j == 0 && i == qhp->wq.rq.cidx) fprintf(stderr, " <-- cidx"); fprintf(stderr, "\n"); p+=2; } } } void dump_state(void) { struct c4iw_dev *dev; int i; fprintf(stderr, "STALL DETECTED:\n"); TAILQ_FOREACH(dev, &devices, list) { //pthread_spin_lock(&dev->lock); fprintf(stderr, "Device %s\n", dev->ibv_dev.name); for (i=0; i < dev->max_cq; i++) { if (dev->cqid2ptr[i]) { struct c4iw_cq *chp = dev->cqid2ptr[i]; //pthread_spin_lock(&chp->lock); dump_cq(chp); //pthread_spin_unlock(&chp->lock); } } for (i=0; i < dev->max_qp; i++) { if (dev->qpid2ptr[i]) { struct c4iw_qp *qhp = dev->qpid2ptr[i]; //pthread_spin_lock(&qhp->lock); dump_qp(qhp); //pthread_spin_unlock(&qhp->lock); } } //pthread_spin_unlock(&dev->lock); } fprintf(stderr, "DUMP COMPLETE:\n"); fflush(stderr); } #endif /* end of STALL_DETECTION */ /* * c4iw_abi_version is used to store ABI for iw_cxgb4 so the user mode library * can know if the driver supports the kernel mode db ringing. */ int c4iw_abi_version = 1; static struct verbs_device *cxgb4_driver_init(const char *uverbs_sys_path, int abi_version) { char devstr[IBV_SYSFS_PATH_MAX], ibdev[16], value[128], *cp; char dev_str[IBV_SYSFS_PATH_MAX]; struct c4iw_dev *dev; unsigned vendor, device, fw_maj, fw_min; int i; char devnum; char ib_param[16]; #ifndef __linux__ if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev", ibdev, sizeof ibdev) < 0) return NULL; devnum = atoi(&ibdev[5]); if (ibdev[0] == 't' && ibdev[1] >= '4' && ibdev[1] <= '6' && strstr(&ibdev[2], "nex") && devnum >= 0) { snprintf(dev_str, sizeof(dev_str), "/dev/t%cnex/%d", ibdev[1], devnum); } else return NULL; if (ibv_read_sysfs_file(dev_str, "\%pnpinfo", value, sizeof value) < 0) return NULL; else { if (strstr(value, "vendor=")) { strncpy(ib_param, strstr(value, "vendor=") + strlen("vendor="), 6); sscanf(ib_param, "%i", &vendor); } if (strstr(value, "device=")) { strncpy(ib_param, strstr(value, "device=") + strlen("device="), 6); sscanf(ib_param, "%i", &device); } } #else if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", value, sizeof value) < 0) return NULL; sscanf(value, "%i", &vendor); if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", value, sizeof value) < 0) return NULL; sscanf(value, "%i", &device); #endif for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) if (vendor == hca_table[i].vendor && device == hca_table[i].device) goto found; return NULL; found: c4iw_abi_version = abi_version; #ifndef __linux__ if (ibv_read_sysfs_file(dev_str, "firmware_version", value, sizeof value) < 0) return NULL; #else /* * Verify that the firmware major number matches. Major number * mismatches are fatal. Minor number mismatches are tolerated. 
*/ if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev", ibdev, sizeof ibdev) < 0) return NULL; memset(devstr, 0, sizeof devstr); snprintf(devstr, sizeof devstr, "%s/class/infiniband/%s", ibv_get_sysfs_path(), ibdev); if (ibv_read_sysfs_file(devstr, "fw_ver", value, sizeof value) < 0) return NULL; #endif cp = strtok(value+1, "."); sscanf(cp, "%i", &fw_maj); cp = strtok(NULL, "."); sscanf(cp, "%i", &fw_min); if ((signed int)fw_maj < FW_MAJ) { fprintf(stderr, "libcxgb4: Fatal firmware version mismatch. " "Firmware major number is %u and libcxgb4 needs %u.\n", fw_maj, FW_MAJ); fflush(stderr); return NULL; } DBGLOG("libcxgb4"); if ((signed int)fw_min < FW_MIN) { PDBG("libcxgb4: non-fatal firmware version mismatch. " "Firmware minor number is %u and libcxgb4 needs %u.\n", fw_min, FW_MIN); fflush(stderr); } PDBG("%s found vendor %d device %d type %d\n", __FUNCTION__, vendor, device, CHELSIO_CHIP_VERSION(hca_table[i].device >> 8)); dev = calloc(1, sizeof *dev); if (!dev) { return NULL; } - pthread_spin_init(&dev->lock, PTHREAD_PROCESS_PRIVATE); + if (pthread_spin_init(&dev->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + dev->ibv_dev.ops = &c4iw_dev_ops; dev->chip_version = CHELSIO_CHIP_VERSION(hca_table[i].device >> 8); dev->abi_version = abi_version; PDBG("%s device claimed\n", __FUNCTION__); TAILQ_INSERT_TAIL(&devices, dev, list); #ifdef STALL_DETECTION { char *c = getenv("CXGB4_STALL_TIMEOUT"); if (c) { stall_to = strtol(c, NULL, 0); if (errno || stall_to < 0) stall_to = 0; } } #endif { char *c = getenv("CXGB4_MA_WR"); if (c) { ma_wr = strtol(c, NULL, 0); if (ma_wr != 1) ma_wr = 0; } } { char *c = getenv("T5_ENABLE_WC"); if (c) { t5_en_wc = strtol(c, NULL, 0); if (t5_en_wc != 1) t5_en_wc = 0; } } return &dev->ibv_dev; + +err: + free(dev); + + return NULL; } static __attribute__((constructor)) void cxgb4_register_driver(void) { c4iw_page_size = sysconf(_SC_PAGESIZE); c4iw_page_shift = long_log2(c4iw_page_size); c4iw_page_mask = ~(c4iw_page_size - 1); verbs_register_driver("cxgb4", cxgb4_driver_init); } #ifdef STATS void __attribute__ ((destructor)) cs_fini(void); void __attribute__ ((destructor)) cs_fini(void) { syslog(LOG_NOTICE, "cxgb4 stats - sends %lu recv %lu read %lu " "write %lu arm %lu cqe %lu mr %lu qp %lu cq %lu\n", c4iw_stats.send, c4iw_stats.recv, c4iw_stats.read, c4iw_stats.write, c4iw_stats.arm, c4iw_stats.cqe, c4iw_stats.mr, c4iw_stats.qp, c4iw_stats.cq); } #endif diff --git a/contrib/ofed/libcxgb4/verbs.c b/contrib/ofed/libcxgb4/verbs.c index 04d765dff3f6..4e44b2285fff 100644 --- a/contrib/ofed/libcxgb4/verbs.c +++ b/contrib/ofed/libcxgb4/verbs.c @@ -1,710 +1,725 @@ /* * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
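/*
 * Editorial note: the dev.c hunks above replace an unchecked
 * pthread_spin_init() with one whose return value is tested, unwinding the
 * allocation on failure.  The following is an illustrative, self-contained
 * sketch of that pattern only; the struct and function names here are
 * hypothetical and are not part of libcxgb4.
 */
#include <pthread.h>
#include <stdlib.h>

struct example_dev {
	pthread_spinlock_t lock;
	void **id_table;
};

/* Allocate a device object, failing cleanly if the lock cannot be set up. */
static struct example_dev *example_dev_alloc(size_t nids)
{
	struct example_dev *dev;

	dev = calloc(1, sizeof(*dev));
	if (!dev)
		return NULL;
	/* pthread_spin_init() returns 0 on success, an errno value on error. */
	if (pthread_spin_init(&dev->lock, PTHREAD_PROCESS_PRIVATE))
		goto err_free;
	dev->id_table = calloc(nids, sizeof(void *));
	if (!dev->id_table)
		goto err_destroy;
	return dev;

err_destroy:
	pthread_spin_destroy(&dev->lock);
err_free:
	free(dev);
	return NULL;
}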
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "libcxgb4.h" #include "cxgb4-abi.h" #define MASKED(x) (void *)((unsigned long)(x) & c4iw_page_mask) int c4iw_query_device(struct ibv_context *context, struct ibv_device_attr *attr) { struct ibv_query_device cmd; uint64_t raw_fw_ver; u8 major, minor, sub_minor, build; int ret; ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); if (ret) return ret; major = (raw_fw_ver >> 24) & 0xff; minor = (raw_fw_ver >> 16) & 0xff; sub_minor = (raw_fw_ver >> 8) & 0xff; build = raw_fw_ver & 0xff; snprintf(attr->fw_ver, sizeof attr->fw_ver, "%d.%d.%d.%d", major, minor, sub_minor, build); return 0; } int c4iw_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr) { struct ibv_query_port cmd; return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); } struct ibv_pd *c4iw_alloc_pd(struct ibv_context *context) { struct ibv_alloc_pd cmd; struct c4iw_alloc_pd_resp resp; struct c4iw_pd *pd; pd = malloc(sizeof *pd); if (!pd) return NULL; if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp)) { free(pd); return NULL; } return &pd->ibv_pd; } int c4iw_free_pd(struct ibv_pd *pd) { int ret; ret = ibv_cmd_dealloc_pd(pd); if (ret) return ret; free(pd); return 0; } static struct ibv_mr *__c4iw_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access) { struct c4iw_mr *mhp; struct ibv_reg_mr cmd; struct ibv_reg_mr_resp resp; struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); mhp = malloc(sizeof *mhp); if (!mhp) return NULL; if (ibv_cmd_reg_mr(pd, addr, length, hca_va, access, &mhp->ibv_mr, &cmd, sizeof cmd, &resp, sizeof resp)) { free(mhp); return NULL; } mhp->va_fbo = hca_va; mhp->len = length; PDBG("%s stag 0x%x va_fbo 0x%" PRIx64 " len %d\n", __func__, mhp->ibv_mr.rkey, mhp->va_fbo, mhp->len); pthread_spin_lock(&dev->lock); dev->mmid2ptr[c4iw_mmid(mhp->ibv_mr.lkey)] = mhp; pthread_spin_unlock(&dev->lock); INC_STAT(mr); return &mhp->ibv_mr; } struct ibv_mr *c4iw_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { PDBG("%s addr %p length %ld\n", __func__, addr, length); return __c4iw_reg_mr(pd, addr, length, (uintptr_t) addr, access); } int c4iw_dereg_mr(struct ibv_mr *mr) { int ret; struct c4iw_dev *dev = to_c4iw_dev(mr->pd->context->device); ret = ibv_cmd_dereg_mr(mr); if (ret) return ret; pthread_spin_lock(&dev->lock); dev->mmid2ptr[c4iw_mmid(mr->lkey)] = NULL; pthread_spin_unlock(&dev->lock); free(to_c4iw_mr(mr)); return 0; } struct ibv_cq *c4iw_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector) { struct ibv_create_cq cmd; struct c4iw_create_cq_resp resp; struct c4iw_cq *chp; struct c4iw_dev *dev = to_c4iw_dev(context->device); int ret; chp = calloc(1, sizeof *chp); if (!chp) { return NULL; } resp.reserved = 0; ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, &chp->ibv_cq, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); if (ret) goto err1; if (resp.reserved) 
PDBG("%s c4iw_create_cq_resp reserved field modified by kernel\n", __FUNCTION__); - pthread_spin_init(&chp->lock, PTHREAD_PROCESS_PRIVATE); + ret = pthread_spin_init(&chp->lock, PTHREAD_PROCESS_PRIVATE); + if (ret) + goto err2; #ifdef STALL_DETECTION gettimeofday(&chp->time, NULL); #endif chp->rhp = dev; chp->cq.qid_mask = resp.qid_mask; chp->cq.cqid = resp.cqid; chp->cq.size = resp.size; chp->cq.memsize = resp.memsize; chp->cq.gen = 1; chp->cq.queue = mmap(NULL, chp->cq.memsize, PROT_READ|PROT_WRITE, MAP_SHARED, context->cmd_fd, resp.key); if (chp->cq.queue == MAP_FAILED) - goto err2; + goto err3; chp->cq.ugts = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, context->cmd_fd, resp.gts_key); if (chp->cq.ugts == MAP_FAILED) - goto err3; + goto err4; if (dev_is_t4(chp->rhp)) chp->cq.ugts += 1; else chp->cq.ugts += 5; chp->cq.sw_queue = calloc(chp->cq.size, sizeof *chp->cq.queue); if (!chp->cq.sw_queue) - goto err4; + goto err5; PDBG("%s cqid 0x%x key %" PRIx64 " va %p memsize %lu gts_key %" PRIx64 " va %p qid_mask 0x%x\n", __func__, chp->cq.cqid, resp.key, chp->cq.queue, chp->cq.memsize, resp.gts_key, chp->cq.ugts, chp->cq.qid_mask); pthread_spin_lock(&dev->lock); dev->cqid2ptr[chp->cq.cqid] = chp; pthread_spin_unlock(&dev->lock); INC_STAT(cq); return &chp->ibv_cq; -err4: +err5: munmap(MASKED(chp->cq.ugts), c4iw_page_size); -err3: +err4: munmap(chp->cq.queue, chp->cq.memsize); +err3: + pthread_spin_destroy(&chp->lock); err2: (void)ibv_cmd_destroy_cq(&chp->ibv_cq); err1: free(chp); return NULL; } int c4iw_resize_cq(struct ibv_cq *ibcq, int cqe) { #if 0 int ret; struct ibv_resize_cq cmd; struct ibv_resize_cq_resp resp; ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd, &resp, sizeof resp); PDBG("%s ret %d\n", __func__, ret); return ret; #else return -ENOSYS; #endif } int c4iw_destroy_cq(struct ibv_cq *ibcq) { int ret; struct c4iw_cq *chp = to_c4iw_cq(ibcq); struct c4iw_dev *dev = to_c4iw_dev(ibcq->context->device); chp->cq.error = 1; ret = ibv_cmd_destroy_cq(ibcq); if (ret) { return ret; } + verbs_cleanup_cq(ibcq); munmap(MASKED(chp->cq.ugts), c4iw_page_size); munmap(chp->cq.queue, chp->cq.memsize); pthread_spin_lock(&dev->lock); dev->cqid2ptr[chp->cq.cqid] = NULL; pthread_spin_unlock(&dev->lock); free(chp->cq.sw_queue); + pthread_spin_destroy(&chp->lock); free(chp); return 0; } struct ibv_srq *c4iw_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *attr) { return NULL; } int c4iw_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int attr_mask) { return ENOSYS; } int c4iw_destroy_srq(struct ibv_srq *srq) { return ENOSYS; } int c4iw_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { return ENOSYS; } static struct ibv_qp *create_qp_v0(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) { struct ibv_create_qp cmd; struct c4iw_create_qp_resp_v0 resp; struct c4iw_qp *qhp; struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); int ret; void *dbva; PDBG("%s enter qp\n", __func__); qhp = calloc(1, sizeof *qhp); if (!qhp) goto err1; ret = ibv_cmd_create_qp(pd, &qhp->ibv_qp, attr, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); if (ret) goto err2; PDBG("%s sqid 0x%x sq key %" PRIx64 " sq db/gts key %" PRIx64 " rqid 0x%x rq key %" PRIx64 " rq db/gts key %" PRIx64 " qid_mask 0x%x\n", __func__, resp.sqid, resp.sq_key, resp.sq_db_gts_key, resp.rqid, resp.rq_key, resp.rq_db_gts_key, resp.qid_mask); qhp->wq.qid_mask = resp.qid_mask; qhp->rhp = dev; qhp->wq.sq.qid = resp.sqid; qhp->wq.sq.size = resp.sq_size; qhp->wq.sq.memsize = resp.sq_memsize; 
qhp->wq.sq.flags = 0; qhp->wq.rq.msn = 1; qhp->wq.rq.qid = resp.rqid; qhp->wq.rq.size = resp.rq_size; qhp->wq.rq.memsize = resp.rq_memsize; - pthread_spin_init(&qhp->lock, PTHREAD_PROCESS_PRIVATE); + ret = pthread_spin_init(&qhp->lock, PTHREAD_PROCESS_PRIVATE); + if (ret) + goto err3; dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.sq_db_gts_key); if (dbva == MAP_FAILED) - goto err3; + goto err4; qhp->wq.sq.udb = dbva; qhp->wq.sq.queue = mmap(NULL, qhp->wq.sq.memsize, PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.sq_key); if (qhp->wq.sq.queue == MAP_FAILED) - goto err4; + goto err5; dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.rq_db_gts_key); if (dbva == MAP_FAILED) - goto err5; + goto err6; qhp->wq.rq.udb = dbva; qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize, PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.rq_key); if (qhp->wq.rq.queue == MAP_FAILED) - goto err6; + goto err7; qhp->wq.sq.sw_sq = calloc(qhp->wq.sq.size, sizeof (struct t4_swsqe)); if (!qhp->wq.sq.sw_sq) - goto err7; + goto err8; qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (uint64_t)); if (!qhp->wq.rq.sw_rq) - goto err8; + goto err9; PDBG("%s sq dbva %p sq qva %p sq depth %u sq memsize %lu " " rq dbva %p rq qva %p rq depth %u rq memsize %lu\n", __func__, qhp->wq.sq.udb, qhp->wq.sq.queue, qhp->wq.sq.size, qhp->wq.sq.memsize, qhp->wq.rq.udb, qhp->wq.rq.queue, qhp->wq.rq.size, qhp->wq.rq.memsize); qhp->sq_sig_all = attr->sq_sig_all; pthread_spin_lock(&dev->lock); dev->qpid2ptr[qhp->wq.sq.qid] = qhp; pthread_spin_unlock(&dev->lock); INC_STAT(qp); return &qhp->ibv_qp; -err8: +err9: free(qhp->wq.sq.sw_sq); -err7: +err8: munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize); -err6: +err7: munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size); -err5: +err6: munmap((void *)qhp->wq.sq.queue, qhp->wq.sq.memsize); -err4: +err5: munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size); +err4: + pthread_spin_destroy(&qhp->lock); err3: (void)ibv_cmd_destroy_qp(&qhp->ibv_qp); err2: free(qhp); err1: return NULL; } static struct ibv_qp *create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) { struct ibv_create_qp cmd; struct c4iw_create_qp_resp resp; struct c4iw_qp *qhp; struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); struct c4iw_context *ctx = to_c4iw_context(pd->context); int ret; void *dbva; PDBG("%s enter qp\n", __func__); qhp = calloc(1, sizeof *qhp); if (!qhp) goto err1; ret = ibv_cmd_create_qp(pd, &qhp->ibv_qp, attr, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); if (ret) goto err2; PDBG("%s sqid 0x%x sq key %" PRIx64 " sq db/gts key %" PRIx64 " rqid 0x%x rq key %" PRIx64 " rq db/gts key %" PRIx64 " qid_mask 0x%x\n", __func__, resp.sqid, resp.sq_key, resp.sq_db_gts_key, resp.rqid, resp.rq_key, resp.rq_db_gts_key, resp.qid_mask); qhp->wq.qid_mask = resp.qid_mask; qhp->rhp = dev; qhp->wq.sq.qid = resp.sqid; qhp->wq.sq.size = resp.sq_size; qhp->wq.sq.memsize = resp.sq_memsize; qhp->wq.sq.flags = resp.flags & C4IW_QPF_ONCHIP ? T4_SQ_ONCHIP : 0; qhp->wq.sq.flush_cidx = -1; qhp->wq.rq.msn = 1; qhp->wq.rq.qid = resp.rqid; qhp->wq.rq.size = resp.rq_size; qhp->wq.rq.memsize = resp.rq_memsize; if (ma_wr && resp.sq_memsize < (resp.sq_size + 1) * sizeof *qhp->wq.sq.queue + 16*sizeof(__be64) ) { ma_wr = 0; fprintf(stderr, "libcxgb4 warning - downlevel iw_cxgb4 driver. 
" "MA workaround disabled.\n"); } - pthread_spin_init(&qhp->lock, PTHREAD_PROCESS_PRIVATE); + ret = pthread_spin_init(&qhp->lock, PTHREAD_PROCESS_PRIVATE); + if (ret) + goto err3; dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.sq_db_gts_key); if (dbva == MAP_FAILED) - goto err3; + goto err4; qhp->wq.sq.udb = dbva; if (!dev_is_t4(qhp->rhp)) { unsigned long segment_offset = 128 * (qhp->wq.sq.qid & qhp->wq.qid_mask); if (segment_offset < c4iw_page_size) { qhp->wq.sq.udb += segment_offset / 4; qhp->wq.sq.wc_reg_available = 1; } else qhp->wq.sq.bar2_qid = qhp->wq.sq.qid & qhp->wq.qid_mask; qhp->wq.sq.udb += 2; } qhp->wq.sq.queue = mmap(NULL, qhp->wq.sq.memsize, PROT_READ|PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.sq_key); if (qhp->wq.sq.queue == MAP_FAILED) - goto err4; + goto err5; dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.rq_db_gts_key); if (dbva == MAP_FAILED) - goto err5; + goto err6; qhp->wq.rq.udb = dbva; if (!dev_is_t4(qhp->rhp)) { unsigned long segment_offset = 128 * (qhp->wq.rq.qid & qhp->wq.qid_mask); if (segment_offset < c4iw_page_size) { qhp->wq.rq.udb += segment_offset / 4; qhp->wq.rq.wc_reg_available = 1; } else qhp->wq.rq.bar2_qid = qhp->wq.rq.qid & qhp->wq.qid_mask; qhp->wq.rq.udb += 2; } qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize, PROT_READ|PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.rq_key); if (qhp->wq.rq.queue == MAP_FAILED) - goto err6; + goto err7; qhp->wq.sq.sw_sq = calloc(qhp->wq.sq.size, sizeof (struct t4_swsqe)); if (!qhp->wq.sq.sw_sq) - goto err7; + goto err8; qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (uint64_t)); if (!qhp->wq.rq.sw_rq) - goto err8; + goto err9; if (t4_sq_onchip(&qhp->wq)) { qhp->wq.sq.ma_sync = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, pd->context->cmd_fd, resp.ma_sync_key); if (qhp->wq.sq.ma_sync == MAP_FAILED) - goto err9; + goto err10; qhp->wq.sq.ma_sync += (A_PCIE_MA_SYNC & (c4iw_page_size - 1)); } if (ctx->status_page_size) { qhp->wq.db_offp = &ctx->status_page->db_off; } else { qhp->wq.db_offp = &qhp->wq.rq.queue[qhp->wq.rq.size].status.db_off; } PDBG("%s sq dbva %p sq qva %p sq depth %u sq memsize %lu " " rq dbva %p rq qva %p rq depth %u rq memsize %lu\n", __func__, qhp->wq.sq.udb, qhp->wq.sq.queue, qhp->wq.sq.size, qhp->wq.sq.memsize, qhp->wq.rq.udb, qhp->wq.rq.queue, qhp->wq.rq.size, qhp->wq.rq.memsize); qhp->sq_sig_all = attr->sq_sig_all; pthread_spin_lock(&dev->lock); dev->qpid2ptr[qhp->wq.sq.qid] = qhp; pthread_spin_unlock(&dev->lock); INC_STAT(qp); return &qhp->ibv_qp; -err9: +err10: free(qhp->wq.rq.sw_rq); -err8: +err9: free(qhp->wq.sq.sw_sq); -err7: +err8: munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize); -err6: +err7: munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size); -err5: +err6: munmap((void *)qhp->wq.sq.queue, qhp->wq.sq.memsize); -err4: +err5: munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size); +err4: + pthread_spin_destroy(&qhp->lock); err3: (void)ibv_cmd_destroy_qp(&qhp->ibv_qp); err2: free(qhp); err1: return NULL; } struct ibv_qp *c4iw_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) { struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); if (dev->abi_version == 0) return create_qp_v0(pd, attr); return create_qp(pd, attr); } static void reset_qp(struct c4iw_qp *qhp) { PDBG("%s enter qp %p\n", __func__, qhp); qhp->wq.sq.cidx = 0; qhp->wq.sq.wq_pidx = qhp->wq.sq.pidx = qhp->wq.sq.in_use = 0; qhp->wq.rq.cidx = qhp->wq.rq.pidx = qhp->wq.rq.in_use = 0; qhp->wq.sq.oldest_read = NULL; memset(qhp->wq.sq.queue, 
0, qhp->wq.sq.memsize); if (t4_sq_onchip(&qhp->wq)) mmio_flush_writes(); memset(qhp->wq.rq.queue, 0, qhp->wq.rq.memsize); } int c4iw_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask) { struct ibv_modify_qp cmd = {}; struct c4iw_qp *qhp = to_c4iw_qp(ibqp); int ret; PDBG("%s enter qp %p new state %d\n", __func__, ibqp, attr_mask & IBV_QP_STATE ? attr->qp_state : -1); pthread_spin_lock(&qhp->lock); if (t4_wq_in_error(&qhp->wq)) c4iw_flush_qp(qhp); ret = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof cmd); if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) reset_qp(qhp); pthread_spin_unlock(&qhp->lock); return ret; } int c4iw_destroy_qp(struct ibv_qp *ibqp) { int ret; struct c4iw_qp *qhp = to_c4iw_qp(ibqp); struct c4iw_dev *dev = to_c4iw_dev(ibqp->context->device); PDBG("%s enter qp %p\n", __func__, ibqp); pthread_spin_lock(&qhp->lock); c4iw_flush_qp(qhp); pthread_spin_unlock(&qhp->lock); ret = ibv_cmd_destroy_qp(ibqp); if (ret) { return ret; } if (t4_sq_onchip(&qhp->wq)) { qhp->wq.sq.ma_sync -= (A_PCIE_MA_SYNC & (c4iw_page_size - 1)); munmap((void *)qhp->wq.sq.ma_sync, c4iw_page_size); } munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size); munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size); munmap(qhp->wq.sq.queue, qhp->wq.sq.memsize); munmap(qhp->wq.rq.queue, qhp->wq.rq.memsize); pthread_spin_lock(&dev->lock); dev->qpid2ptr[qhp->wq.sq.qid] = NULL; pthread_spin_unlock(&dev->lock); free(qhp->wq.rq.sw_rq); free(qhp->wq.sq.sw_sq); + pthread_spin_destroy(&qhp->lock); free(qhp); return 0; } int c4iw_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd; struct c4iw_qp *qhp = to_c4iw_qp(ibqp); int ret; pthread_spin_lock(&qhp->lock); if (t4_wq_in_error(&qhp->wq)) c4iw_flush_qp(qhp); ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd); pthread_spin_unlock(&qhp->lock); return ret; } struct ibv_ah *c4iw_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { return NULL; } int c4iw_destroy_ah(struct ibv_ah *ah) { return ENOSYS; } int c4iw_attach_mcast(struct ibv_qp *ibqp, const union ibv_gid *gid, uint16_t lid) { struct c4iw_qp *qhp = to_c4iw_qp(ibqp); int ret; pthread_spin_lock(&qhp->lock); if (t4_wq_in_error(&qhp->wq)) c4iw_flush_qp(qhp); ret = ibv_cmd_attach_mcast(ibqp, gid, lid); pthread_spin_unlock(&qhp->lock); return ret; } int c4iw_detach_mcast(struct ibv_qp *ibqp, const union ibv_gid *gid, uint16_t lid) { struct c4iw_qp *qhp = to_c4iw_qp(ibqp); int ret; pthread_spin_lock(&qhp->lock); if (t4_wq_in_error(&qhp->wq)) c4iw_flush_qp(qhp); ret = ibv_cmd_detach_mcast(ibqp, gid, lid); pthread_spin_unlock(&qhp->lock); return ret; } void c4iw_async_event(struct ibv_async_event *event) { PDBG("%s type %d obj %p\n", __func__, event->event_type, event->element.cq); switch (event->event_type) { case IBV_EVENT_CQ_ERR: break; case IBV_EVENT_QP_FATAL: case IBV_EVENT_QP_REQ_ERR: case IBV_EVENT_QP_ACCESS_ERR: case IBV_EVENT_PATH_MIG_ERR: { struct c4iw_qp *qhp = to_c4iw_qp(event->element.qp); pthread_spin_lock(&qhp->lock); c4iw_flush_qp(qhp); pthread_spin_unlock(&qhp->lock); break; } case IBV_EVENT_SQ_DRAINED: case IBV_EVENT_PATH_MIG: case IBV_EVENT_COMM_EST: case IBV_EVENT_QP_LAST_WQE_REACHED: default: break; } } diff --git a/contrib/ofed/libibcm/cm.c b/contrib/ofed/libibcm/cm.c index 07ba481afa3d..da3412eb2fce 100644 --- a/contrib/ofed/libibcm/cm.c +++ b/contrib/ofed/libibcm/cm.c @@ -1,1023 +1,1024 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. 
* Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $Id$ */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #define PFX "libibcm: " #define IB_USER_CM_MIN_ABI_VERSION 4 #define IB_USER_CM_MAX_ABI_VERSION 5 static int abi_ver; static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; enum { IB_UCM_MAX_DEVICES = 32 }; static inline int ERR(int err) { errno = err; return -1; } #define CM_CREATE_MSG_CMD_RESP(msg, cmd, resp, type, size) \ do { \ struct ib_ucm_cmd_hdr *hdr; \ \ size = sizeof(*hdr) + sizeof(*cmd); \ msg = alloca(size); \ if (!msg) \ return ERR(ENOMEM); \ hdr = msg; \ cmd = msg + sizeof(*hdr); \ hdr->cmd = type; \ hdr->in = sizeof(*cmd); \ hdr->out = sizeof(*resp); \ memset(cmd, 0, sizeof(*cmd)); \ resp = alloca(sizeof(*resp)); \ if (!resp) \ return ERR(ENOMEM); \ cmd->response = (uintptr_t)resp;\ } while (0) #define CM_CREATE_MSG_CMD(msg, cmd, type, size) \ do { \ struct ib_ucm_cmd_hdr *hdr; \ \ size = sizeof(*hdr) + sizeof(*cmd); \ msg = alloca(size); \ if (!msg) \ return ERR(ENOMEM); \ hdr = msg; \ cmd = msg + sizeof(*hdr); \ hdr->cmd = type; \ hdr->in = sizeof(*cmd); \ hdr->out = 0; \ memset(cmd, 0, sizeof(*cmd)); \ } while (0) struct cm_id_private { struct ib_cm_id id; int events_completed; pthread_cond_t cond; pthread_mutex_t mut; }; static int check_abi_version(void) { char value[8]; if (ibv_read_sysfs_file(ibv_get_sysfs_path(), "class/infiniband_cm/abi_version", value, sizeof value) < 0) { fprintf(stderr, PFX "couldn't read ABI version\n"); return 0; } abi_ver = strtol(value, NULL, 10); if (abi_ver < IB_USER_CM_MIN_ABI_VERSION || abi_ver > IB_USER_CM_MAX_ABI_VERSION) { fprintf(stderr, PFX "kernel ABI version %d " "doesn't match library version %d.\n", abi_ver, IB_USER_CM_MAX_ABI_VERSION); return -1; } return 0; } static int ucm_init(void) { int ret = 0; pthread_mutex_lock(&mut); if (!abi_ver) ret = check_abi_version(); pthread_mutex_unlock(&mut); return ret; } static int ucm_get_dev_index(char *dev_name) { char *dev_path; char ibdev[IBV_SYSFS_NAME_MAX]; int i, ret; for (i = 0; i < IB_UCM_MAX_DEVICES; i++) { ret = asprintf(&dev_path, "/sys/class/infiniband_cm/ucm%d", i); if (ret < 0) return -1; ret = 
ibv_read_sysfs_file(dev_path, "ibdev", ibdev, sizeof ibdev); if (ret < 0) continue; if (!strcmp(dev_name, ibdev)) { free(dev_path); return i; } free(dev_path); } return -1; } struct ib_cm_device* ib_cm_open_device(struct ibv_context *device_context) { struct ib_cm_device *dev; char *dev_path; int index, ret; if (ucm_init()) return NULL; index = ucm_get_dev_index(device_context->device->name); if (index < 0) return NULL; dev = malloc(sizeof *dev); if (!dev) return NULL; dev->device_context = device_context; ret = asprintf(&dev_path, "/dev/ucm%d", index); if (ret < 0) goto err1; dev->fd = open(dev_path, O_RDWR); if (dev->fd < 0) goto err2; free(dev_path); return dev; err2: free(dev_path); err1: free(dev); return NULL; } void ib_cm_close_device(struct ib_cm_device *device) { close(device->fd); free(device); } static void ib_cm_free_id(struct cm_id_private *cm_id_priv) { pthread_cond_destroy(&cm_id_priv->cond); pthread_mutex_destroy(&cm_id_priv->mut); free(cm_id_priv); } static struct cm_id_private *ib_cm_alloc_id(struct ib_cm_device *device, void *context) { struct cm_id_private *cm_id_priv; cm_id_priv = malloc(sizeof *cm_id_priv); if (!cm_id_priv) return NULL; memset(cm_id_priv, 0, sizeof *cm_id_priv); cm_id_priv->id.device = device; cm_id_priv->id.context = context; - pthread_mutex_init(&cm_id_priv->mut, NULL); + if (pthread_mutex_init(&cm_id_priv->mut, NULL)) + goto err; if (pthread_cond_init(&cm_id_priv->cond, NULL)) goto err; return cm_id_priv; err: ib_cm_free_id(cm_id_priv); return NULL; } int ib_cm_create_id(struct ib_cm_device *device, struct ib_cm_id **cm_id, void *context) { struct ib_ucm_create_id_resp *resp; struct ib_ucm_create_id *cmd; struct cm_id_private *cm_id_priv; void *msg; int result; int size; cm_id_priv = ib_cm_alloc_id(device, context); if (!cm_id_priv) return ERR(ENOMEM); CM_CREATE_MSG_CMD_RESP(msg, cmd, resp, IB_USER_CM_CMD_CREATE_ID, size); cmd->uid = (uintptr_t) cm_id_priv; result = write(device->fd, msg, size); if (result != size) goto err; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); cm_id_priv->id.handle = resp->id; *cm_id = &cm_id_priv->id; return 0; err: ib_cm_free_id(cm_id_priv); return result; } int ib_cm_destroy_id(struct ib_cm_id *cm_id) { struct ib_ucm_destroy_id_resp *resp; struct ib_ucm_destroy_id *cmd; struct cm_id_private *cm_id_priv; void *msg; int result; int size; CM_CREATE_MSG_CMD_RESP(msg, cmd, resp, IB_USER_CM_CMD_DESTROY_ID, size); cmd->id = cm_id->handle; result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); cm_id_priv = container_of(cm_id, struct cm_id_private, id); pthread_mutex_lock(&cm_id_priv->mut); while (cm_id_priv->events_completed < resp->events_reported) pthread_cond_wait(&cm_id_priv->cond, &cm_id_priv->mut); pthread_mutex_unlock(&cm_id_priv->mut); ib_cm_free_id(cm_id_priv); return 0; } int ib_cm_attr_id(struct ib_cm_id *cm_id, struct ib_cm_attr_param *param) { struct ib_ucm_attr_id_resp *resp; struct ib_ucm_attr_id *cmd; void *msg; int result; int size; if (!param) return ERR(EINVAL); CM_CREATE_MSG_CMD_RESP(msg, cmd, resp, IB_USER_CM_CMD_ATTR_ID, size); cmd->id = cm_id->handle; result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? 
ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); param->service_id = resp->service_id; param->service_mask = resp->service_mask; param->local_id = resp->local_id; param->remote_id = resp->remote_id; return 0; } int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, struct ibv_qp_attr *qp_attr, int *qp_attr_mask) { struct ibv_kern_qp_attr *resp; struct ib_ucm_init_qp_attr *cmd; void *msg; int result; int size; if (!qp_attr || !qp_attr_mask) return ERR(EINVAL); CM_CREATE_MSG_CMD_RESP(msg, cmd, resp, IB_USER_CM_CMD_INIT_QP_ATTR, size); cmd->id = cm_id->handle; cmd->qp_state = qp_attr->qp_state; result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : result; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); *qp_attr_mask = resp->qp_attr_mask; ibv_copy_qp_attr_from_kern(qp_attr, resp); return 0; } int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) { struct ib_ucm_listen *cmd; void *msg; int result; int size; CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_LISTEN, size); cmd->id = cm_id->handle; cmd->service_id = service_id; cmd->service_mask = service_mask; result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; return 0; } int ib_cm_send_req(struct ib_cm_id *cm_id, struct ib_cm_req_param *param) { struct ib_user_path_rec p_path; struct ib_user_path_rec *a_path; struct ib_ucm_req *cmd; void *msg; int result; int size; if (!param || !param->primary_path) return ERR(EINVAL); CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_SEND_REQ, size); cmd->id = cm_id->handle; cmd->qpn = param->qp_num; cmd->qp_type = param->qp_type; cmd->psn = param->starting_psn; cmd->sid = param->service_id; cmd->peer_to_peer = param->peer_to_peer; cmd->responder_resources = param->responder_resources; cmd->initiator_depth = param->initiator_depth; cmd->remote_cm_response_timeout = param->remote_cm_response_timeout; cmd->flow_control = param->flow_control; cmd->local_cm_response_timeout = param->local_cm_response_timeout; cmd->retry_count = param->retry_count; cmd->rnr_retry_count = param->rnr_retry_count; cmd->max_cm_retries = param->max_cm_retries; cmd->srq = param->srq; ibv_copy_path_rec_to_kern(&p_path, param->primary_path); cmd->primary_path = (uintptr_t) &p_path; if (param->alternate_path) { a_path = alloca(sizeof(*a_path)); if (!a_path) return ERR(ENOMEM); ibv_copy_path_rec_to_kern(a_path, param->alternate_path); cmd->alternate_path = (uintptr_t) a_path; } if (param->private_data && param->private_data_len) { cmd->data = (uintptr_t) param->private_data; cmd->len = param->private_data_len; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? 
ERR(ENODATA) : -1; return 0; } int ib_cm_send_rep(struct ib_cm_id *cm_id, struct ib_cm_rep_param *param) { struct ib_ucm_rep *cmd; void *msg; int result; int size; if (!param) return ERR(EINVAL); CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_SEND_REP, size); cmd->uid = (uintptr_t) container_of(cm_id, struct cm_id_private, id); cmd->id = cm_id->handle; cmd->qpn = param->qp_num; cmd->psn = param->starting_psn; cmd->responder_resources = param->responder_resources; cmd->initiator_depth = param->initiator_depth; cmd->target_ack_delay = param->target_ack_delay; cmd->failover_accepted = param->failover_accepted; cmd->flow_control = param->flow_control; cmd->rnr_retry_count = param->rnr_retry_count; cmd->srq = param->srq; if (param->private_data && param->private_data_len) { cmd->data = (uintptr_t) param->private_data; cmd->len = param->private_data_len; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; return 0; } static inline int cm_send_private_data(struct ib_cm_id *cm_id, uint32_t type, void *private_data, uint8_t private_data_len) { struct ib_ucm_private_data *cmd; void *msg; int result; int size; CM_CREATE_MSG_CMD(msg, cmd, type, size); cmd->id = cm_id->handle; if (private_data && private_data_len) { cmd->data = (uintptr_t) private_data; cmd->len = private_data_len; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; return 0; } int ib_cm_send_rtu(struct ib_cm_id *cm_id, void *private_data, uint8_t private_data_len) { return cm_send_private_data(cm_id, IB_USER_CM_CMD_SEND_RTU, private_data, private_data_len); } int ib_cm_send_dreq(struct ib_cm_id *cm_id, void *private_data, uint8_t private_data_len) { return cm_send_private_data(cm_id, IB_USER_CM_CMD_SEND_DREQ, private_data, private_data_len); } int ib_cm_send_drep(struct ib_cm_id *cm_id, void *private_data, uint8_t private_data_len) { return cm_send_private_data(cm_id, IB_USER_CM_CMD_SEND_DREP, private_data, private_data_len); } static int cm_establish(struct ib_cm_id *cm_id) { /* In kernel ABI 4 ESTABLISH was repurposed as NOTIFY and gained an extra field. For some reason the compat definitions were deleted from the uapi headers :( */ #define IB_USER_CM_CMD_ESTABLISH IB_USER_CM_CMD_NOTIFY struct cm_abi_establish { /* ABI 4 support */ __u32 id; }; struct cm_abi_establish *cmd; void *msg; int result; int size; CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_ESTABLISH, size); cmd->id = cm_id->handle; result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; return 0; } int ib_cm_notify(struct ib_cm_id *cm_id, enum ibv_event_type event) { struct ib_ucm_notify *cmd; void *msg; int result; int size; if (abi_ver == 4) { if (event == IBV_EVENT_COMM_EST) return cm_establish(cm_id); else return ERR(EINVAL); } CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_NOTIFY, size); cmd->id = cm_id->handle; cmd->event = event; result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? 
ERR(ENODATA) : -1; return 0; } static inline int cm_send_status(struct ib_cm_id *cm_id, uint32_t type, int status, void *info, uint8_t info_length, void *private_data, uint8_t private_data_len) { struct ib_ucm_info *cmd; void *msg; int result; int size; CM_CREATE_MSG_CMD(msg, cmd, type, size); cmd->id = cm_id->handle; cmd->status = status; if (private_data && private_data_len) { cmd->data = (uintptr_t) private_data; cmd->data_len = private_data_len; } if (info && info_length) { cmd->info = (uintptr_t) info; cmd->info_len = info_length; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; return 0; } int ib_cm_send_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, void *ari, uint8_t ari_length, void *private_data, uint8_t private_data_len) { return cm_send_status(cm_id, IB_USER_CM_CMD_SEND_REJ, reason, ari, ari_length, private_data, private_data_len); } int ib_cm_send_apr(struct ib_cm_id *cm_id, enum ib_cm_apr_status status, void *info, uint8_t info_length, void *private_data, uint8_t private_data_len) { return cm_send_status(cm_id, IB_USER_CM_CMD_SEND_APR, status, info, info_length, private_data, private_data_len); } int ib_cm_send_mra(struct ib_cm_id *cm_id, uint8_t service_timeout, void *private_data, uint8_t private_data_len) { struct ib_ucm_mra *cmd; void *msg; int result; int size; CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_SEND_MRA, size); cmd->id = cm_id->handle; cmd->timeout = service_timeout; if (private_data && private_data_len) { cmd->data = (uintptr_t) private_data; cmd->len = private_data_len; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : result; return 0; } int ib_cm_send_lap(struct ib_cm_id *cm_id, struct ibv_sa_path_rec *alternate_path, void *private_data, uint8_t private_data_len) { struct ib_user_path_rec abi_path; struct ib_ucm_lap *cmd; void *msg; int result; int size; CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_SEND_LAP, size); cmd->id = cm_id->handle; ibv_copy_path_rec_to_kern(&abi_path, alternate_path); cmd->path = (uintptr_t) &abi_path; if (private_data && private_data_len) { cmd->data = (uintptr_t) private_data; cmd->len = private_data_len; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; return 0; } int ib_cm_send_sidr_req(struct ib_cm_id *cm_id, struct ib_cm_sidr_req_param *param) { struct ib_user_path_rec abi_path; struct ib_ucm_sidr_req *cmd; void *msg; int result; int size; if (!param || !param->path) return ERR(EINVAL); CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_SEND_SIDR_REQ, size); cmd->id = cm_id->handle; cmd->sid = param->service_id; cmd->timeout = param->timeout_ms; cmd->max_cm_retries = param->max_cm_retries; ibv_copy_path_rec_to_kern(&abi_path, param->path); cmd->path = (uintptr_t) &abi_path; if (param->private_data && param->private_data_len) { cmd->data = (uintptr_t) param->private_data; cmd->len = param->private_data_len; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? 
ERR(ENODATA) : result; return 0; } int ib_cm_send_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param) { struct ib_ucm_sidr_rep *cmd; void *msg; int result; int size; if (!param) return ERR(EINVAL); CM_CREATE_MSG_CMD(msg, cmd, IB_USER_CM_CMD_SEND_SIDR_REP, size); cmd->id = cm_id->handle; cmd->qpn = param->qp_num; cmd->qkey = param->qkey; cmd->status = param->status; if (param->private_data && param->private_data_len) { cmd->data = (uintptr_t) param->private_data; cmd->data_len = param->private_data_len; } if (param->info && param->info_length) { cmd->info = (uintptr_t) param->info; cmd->info_len = param->info_length; } result = write(cm_id->device->fd, msg, size); if (result != size) return (result >= 0) ? ERR(ENODATA) : -1; return 0; } static void cm_event_req_get(struct ib_cm_req_event_param *ureq, struct ib_ucm_req_event_resp *kreq) { ureq->remote_ca_guid = kreq->remote_ca_guid; ureq->remote_qkey = kreq->remote_qkey; ureq->remote_qpn = kreq->remote_qpn; ureq->qp_type = kreq->qp_type; ureq->starting_psn = kreq->starting_psn; ureq->responder_resources = kreq->responder_resources; ureq->initiator_depth = kreq->initiator_depth; ureq->local_cm_response_timeout = kreq->local_cm_response_timeout; ureq->flow_control = kreq->flow_control; ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout; ureq->retry_count = kreq->retry_count; ureq->rnr_retry_count = kreq->rnr_retry_count; ureq->srq = kreq->srq; ureq->port = kreq->port; ibv_copy_path_rec_from_kern(ureq->primary_path, &kreq->primary_path); if (ureq->alternate_path) ibv_copy_path_rec_from_kern(ureq->alternate_path, &kreq->alternate_path); } static void cm_event_rep_get(struct ib_cm_rep_event_param *urep, struct ib_ucm_rep_event_resp *krep) { urep->remote_ca_guid = krep->remote_ca_guid; urep->remote_qkey = krep->remote_qkey; urep->remote_qpn = krep->remote_qpn; urep->starting_psn = krep->starting_psn; urep->responder_resources = krep->responder_resources; urep->initiator_depth = krep->initiator_depth; urep->target_ack_delay = krep->target_ack_delay; urep->failover_accepted = krep->failover_accepted; urep->flow_control = krep->flow_control; urep->rnr_retry_count = krep->rnr_retry_count; urep->srq = krep->srq; } static void cm_event_sidr_rep_get(struct ib_cm_sidr_rep_event_param *urep, struct ib_ucm_sidr_rep_event_resp *krep) { urep->status = krep->status; urep->qkey = krep->qkey; urep->qpn = krep->qpn; }; int ib_cm_get_event(struct ib_cm_device *device, struct ib_cm_event **event) { struct cm_id_private *cm_id_priv; struct ib_ucm_cmd_hdr *hdr; struct ib_ucm_event_get *cmd; struct ib_ucm_event_resp *resp; struct ib_cm_event *evt = NULL; struct ibv_sa_path_rec *path_a = NULL; struct ibv_sa_path_rec *path_b = NULL; void *data = NULL; void *info = NULL; void *msg; int result = 0; int size; if (!event) return ERR(EINVAL); size = sizeof(*hdr) + sizeof(*cmd); msg = alloca(size); if (!msg) return ERR(ENOMEM); hdr = msg; cmd = msg + sizeof(*hdr); hdr->cmd = IB_USER_CM_CMD_EVENT; hdr->in = sizeof(*cmd); hdr->out = sizeof(*resp); memset(cmd, 0, sizeof(*cmd)); resp = alloca(sizeof(*resp)); if (!resp) return ERR(ENOMEM); cmd->response = (uintptr_t) resp; cmd->data_len = (uint8_t)(~0U); cmd->info_len = (uint8_t)(~0U); data = malloc(cmd->data_len); if (!data) { result = ERR(ENOMEM); goto done; } info = malloc(cmd->info_len); if (!info) { result = ERR(ENOMEM); goto done; } cmd->data = (uintptr_t) data; cmd->info = (uintptr_t) info; result = write(device->fd, msg, size); if (result != size) { result = (result >= 0) ? 
ERR(ENODATA) : -1; goto done; } VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); /* * decode event. */ evt = malloc(sizeof(*evt)); if (!evt) { result = ERR(ENOMEM); goto done; } memset(evt, 0, sizeof(*evt)); evt->cm_id = (void *) (uintptr_t) resp->uid; evt->event = resp->event; if (resp->present & IB_UCM_PRES_PRIMARY) { path_a = malloc(sizeof(*path_a)); if (!path_a) { result = ERR(ENOMEM); goto done; } } if (resp->present & IB_UCM_PRES_ALTERNATE) { path_b = malloc(sizeof(*path_b)); if (!path_b) { result = ERR(ENOMEM); goto done; } } switch (evt->event) { case IB_CM_REQ_RECEIVED: evt->param.req_rcvd.listen_id = evt->cm_id; cm_id_priv = ib_cm_alloc_id(evt->cm_id->device, evt->cm_id->context); if (!cm_id_priv) { result = ERR(ENOMEM); goto done; } cm_id_priv->id.handle = resp->id; evt->cm_id = &cm_id_priv->id; evt->param.req_rcvd.primary_path = path_a; evt->param.req_rcvd.alternate_path = path_b; path_a = NULL; path_b = NULL; cm_event_req_get(&evt->param.req_rcvd, &resp->u.req_resp); break; case IB_CM_REP_RECEIVED: cm_event_rep_get(&evt->param.rep_rcvd, &resp->u.rep_resp); break; case IB_CM_MRA_RECEIVED: evt->param.mra_rcvd.service_timeout = resp->u.mra_resp.timeout; break; case IB_CM_REJ_RECEIVED: evt->param.rej_rcvd.reason = resp->u.rej_resp.reason; evt->param.rej_rcvd.ari = info; info = NULL; break; case IB_CM_LAP_RECEIVED: evt->param.lap_rcvd.alternate_path = path_b; path_b = NULL; ibv_copy_path_rec_from_kern(evt->param.lap_rcvd.alternate_path, &resp->u.lap_resp.path); break; case IB_CM_APR_RECEIVED: evt->param.apr_rcvd.ap_status = resp->u.apr_resp.status; evt->param.apr_rcvd.apr_info = info; info = NULL; break; case IB_CM_SIDR_REQ_RECEIVED: evt->param.sidr_req_rcvd.listen_id = evt->cm_id; cm_id_priv = ib_cm_alloc_id(evt->cm_id->device, evt->cm_id->context); if (!cm_id_priv) { result = ERR(ENOMEM); goto done; } cm_id_priv->id.handle = resp->id; evt->cm_id = &cm_id_priv->id; evt->param.sidr_req_rcvd.pkey = resp->u.sidr_req_resp.pkey; evt->param.sidr_req_rcvd.port = resp->u.sidr_req_resp.port; break; case IB_CM_SIDR_REP_RECEIVED: cm_event_sidr_rep_get(&evt->param.sidr_rep_rcvd, &resp->u.sidr_rep_resp); evt->param.sidr_rep_rcvd.info = info; info = NULL; break; default: evt->param.send_status = resp->u.send_status; break; } if (resp->present & IB_UCM_PRES_DATA) { evt->private_data = data; data = NULL; } *event = evt; evt = NULL; result = 0; done: if (data) free(data); if (info) free(info); if (path_a) free(path_a); if (path_b) free(path_b); if (evt) free(evt); return result; } int ib_cm_ack_event(struct ib_cm_event *event) { struct cm_id_private *cm_id_priv; if (!event) return ERR(EINVAL); if (event->private_data) free(event->private_data); cm_id_priv = container_of(event->cm_id, struct cm_id_private, id); switch (event->event) { case IB_CM_REQ_RECEIVED: cm_id_priv = container_of(event->param.req_rcvd.listen_id, struct cm_id_private, id); free(event->param.req_rcvd.primary_path); if (event->param.req_rcvd.alternate_path) free(event->param.req_rcvd.alternate_path); break; case IB_CM_REJ_RECEIVED: if (event->param.rej_rcvd.ari) free(event->param.rej_rcvd.ari); break; case IB_CM_LAP_RECEIVED: free(event->param.lap_rcvd.alternate_path); break; case IB_CM_APR_RECEIVED: if (event->param.apr_rcvd.apr_info) free(event->param.apr_rcvd.apr_info); break; case IB_CM_SIDR_REQ_RECEIVED: cm_id_priv = container_of(event->param.sidr_req_rcvd.listen_id, struct cm_id_private, id); break; case IB_CM_SIDR_REP_RECEIVED: if (event->param.sidr_rep_rcvd.info) free(event->param.sidr_rep_rcvd.info); default: break; } 
pthread_mutex_lock(&cm_id_priv->mut); cm_id_priv->events_completed++; pthread_cond_signal(&cm_id_priv->cond); pthread_mutex_unlock(&cm_id_priv->mut); free(event); return 0; } diff --git a/contrib/ofed/libibverbs/cmd.c b/contrib/ofed/libibverbs/cmd.c index 0a9cc3831d41..488ffedd146b 100644 --- a/contrib/ofed/libibverbs/cmd.c +++ b/contrib/ofed/libibverbs/cmd.c @@ -1,2090 +1,2172 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include "ibverbs.h" #include int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, size_t cmd_size, struct ibv_get_context_resp *resp, size_t resp_size) { if (abi_ver < IB_USER_VERBS_MIN_ABI_VERSION) return ENOSYS; IBV_INIT_CMD_RESP(cmd, cmd_size, GET_CONTEXT, resp, resp_size); if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); context->async_fd = resp->async_fd; context->num_comp_vectors = resp->num_comp_vectors; return 0; } static void copy_query_dev_fields(struct ibv_device_attr *device_attr, struct ibv_query_device_resp *resp, uint64_t *raw_fw_ver) { *raw_fw_ver = resp->fw_ver; device_attr->node_guid = resp->node_guid; device_attr->sys_image_guid = resp->sys_image_guid; device_attr->max_mr_size = resp->max_mr_size; device_attr->page_size_cap = resp->page_size_cap; device_attr->vendor_id = resp->vendor_id; device_attr->vendor_part_id = resp->vendor_part_id; device_attr->hw_ver = resp->hw_ver; device_attr->max_qp = resp->max_qp; device_attr->max_qp_wr = resp->max_qp_wr; device_attr->device_cap_flags = resp->device_cap_flags; device_attr->max_sge = resp->max_sge; device_attr->max_sge_rd = resp->max_sge_rd; device_attr->max_cq = resp->max_cq; device_attr->max_cqe = resp->max_cqe; device_attr->max_mr = resp->max_mr; device_attr->max_pd = resp->max_pd; device_attr->max_qp_rd_atom = resp->max_qp_rd_atom; device_attr->max_ee_rd_atom = resp->max_ee_rd_atom; device_attr->max_res_rd_atom = resp->max_res_rd_atom; device_attr->max_qp_init_rd_atom = resp->max_qp_init_rd_atom; device_attr->max_ee_init_rd_atom = resp->max_ee_init_rd_atom; device_attr->atomic_cap = resp->atomic_cap; device_attr->max_ee = resp->max_ee; device_attr->max_rdd = resp->max_rdd; device_attr->max_mw = resp->max_mw; device_attr->max_raw_ipv6_qp = resp->max_raw_ipv6_qp; device_attr->max_raw_ethy_qp = resp->max_raw_ethy_qp; device_attr->max_mcast_grp = resp->max_mcast_grp; device_attr->max_mcast_qp_attach = resp->max_mcast_qp_attach; device_attr->max_total_mcast_qp_attach = resp->max_total_mcast_qp_attach; device_attr->max_ah = resp->max_ah; device_attr->max_fmr = resp->max_fmr; device_attr->max_map_per_fmr = resp->max_map_per_fmr; device_attr->max_srq = resp->max_srq; device_attr->max_srq_wr = resp->max_srq_wr; device_attr->max_srq_sge = resp->max_srq_sge; device_attr->max_pkeys = resp->max_pkeys; device_attr->local_ca_ack_delay = resp->local_ca_ack_delay; device_attr->phys_port_cnt = resp->phys_port_cnt; } int ibv_cmd_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr, uint64_t *raw_fw_ver, struct ibv_query_device *cmd, size_t cmd_size) { struct ibv_query_device_resp resp; IBV_INIT_CMD_RESP(cmd, cmd_size, QUERY_DEVICE, &resp, sizeof resp); if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); memset(device_attr->fw_ver, 0, sizeof device_attr->fw_ver); copy_query_dev_fields(device_attr, &resp, raw_fw_ver); return 0; } int ibv_cmd_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size, uint64_t *raw_fw_ver, struct ibv_query_device_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_query_device_resp_ex *resp, size_t resp_core_size, size_t resp_size) { int err; if (input && input->comp_mask) return EINVAL; if (attr_size < offsetof(struct ibv_device_attr_ex, comp_mask) + 
sizeof(attr->comp_mask)) return EINVAL; if (resp_core_size < offsetof(struct ibv_query_device_resp_ex, response_length) + sizeof(resp->response_length)) return EINVAL; IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, QUERY_DEVICE_EX, resp, resp_core_size, resp_size); cmd->comp_mask = 0; cmd->reserved = 0; memset(attr->orig_attr.fw_ver, 0, sizeof(attr->orig_attr.fw_ver)); memset(&attr->comp_mask, 0, attr_size - sizeof(attr->orig_attr)); err = write(context->cmd_fd, cmd, cmd_size); if (err != cmd_size) return errno; (void)VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); copy_query_dev_fields(&attr->orig_attr, &resp->base, raw_fw_ver); /* Report back supported comp_mask bits. For now no comp_mask bit is * defined */ attr->comp_mask = resp->comp_mask & 0; if (attr_size >= offsetof(struct ibv_device_attr_ex, odp_caps) + sizeof(attr->odp_caps)) { if (resp->response_length >= offsetof(struct ibv_query_device_resp_ex, odp_caps) + sizeof(resp->odp_caps)) { attr->odp_caps.general_caps = resp->odp_caps.general_caps; attr->odp_caps.per_transport_caps.rc_odp_caps = resp->odp_caps.per_transport_caps.rc_odp_caps; attr->odp_caps.per_transport_caps.uc_odp_caps = resp->odp_caps.per_transport_caps.uc_odp_caps; attr->odp_caps.per_transport_caps.ud_odp_caps = resp->odp_caps.per_transport_caps.ud_odp_caps; } } if (attr_size >= offsetof(struct ibv_device_attr_ex, completion_timestamp_mask) + sizeof(attr->completion_timestamp_mask)) { if (resp->response_length >= offsetof(struct ibv_query_device_resp_ex, timestamp_mask) + sizeof(resp->timestamp_mask)) attr->completion_timestamp_mask = resp->timestamp_mask; } if (attr_size >= offsetof(struct ibv_device_attr_ex, hca_core_clock) + sizeof(attr->hca_core_clock)) { if (resp->response_length >= offsetof(struct ibv_query_device_resp_ex, hca_core_clock) + sizeof(resp->hca_core_clock)) attr->hca_core_clock = resp->hca_core_clock; } if (attr_size >= offsetof(struct ibv_device_attr_ex, device_cap_flags_ex) + sizeof(attr->device_cap_flags_ex)) { if (resp->response_length >= offsetof(struct ibv_query_device_resp_ex, device_cap_flags_ex) + sizeof(resp->device_cap_flags_ex)) attr->device_cap_flags_ex = resp->device_cap_flags_ex; } if (attr_size >= offsetof(struct ibv_device_attr_ex, rss_caps) + sizeof(attr->rss_caps)) { if (resp->response_length >= offsetof(struct ibv_query_device_resp_ex, rss_caps) + sizeof(resp->rss_caps)) { attr->rss_caps.supported_qpts = resp->rss_caps.supported_qpts; attr->rss_caps.max_rwq_indirection_tables = resp->rss_caps.max_rwq_indirection_tables; attr->rss_caps.max_rwq_indirection_table_size = resp->rss_caps.max_rwq_indirection_table_size; } } if (attr_size >= offsetof(struct ibv_device_attr_ex, max_wq_type_rq) + sizeof(attr->max_wq_type_rq)) { if (resp->response_length >= offsetof(struct ibv_query_device_resp_ex, max_wq_type_rq) + sizeof(resp->max_wq_type_rq)) attr->max_wq_type_rq = resp->max_wq_type_rq; } if (attr_size >= offsetof(struct ibv_device_attr_ex, raw_packet_caps) + sizeof(attr->raw_packet_caps)) { if (resp->response_length >= offsetof(struct ibv_query_device_resp_ex, raw_packet_caps) + sizeof(resp->raw_packet_caps)) attr->raw_packet_caps = resp->raw_packet_caps; } return 0; } int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr, struct ibv_query_port *cmd, size_t cmd_size) { struct ibv_query_port_resp resp; IBV_INIT_CMD_RESP(cmd, cmd_size, QUERY_PORT, &resp, sizeof resp); cmd->port_num = port_num; memset(cmd->reserved, 0, sizeof cmd->reserved); if (write(context->cmd_fd, cmd, cmd_size) != 
cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); port_attr->state = resp.state; port_attr->max_mtu = resp.max_mtu; port_attr->active_mtu = resp.active_mtu; port_attr->gid_tbl_len = resp.gid_tbl_len; port_attr->port_cap_flags = resp.port_cap_flags; port_attr->max_msg_sz = resp.max_msg_sz; port_attr->bad_pkey_cntr = resp.bad_pkey_cntr; port_attr->qkey_viol_cntr = resp.qkey_viol_cntr; port_attr->pkey_tbl_len = resp.pkey_tbl_len; port_attr->lid = resp.lid; port_attr->sm_lid = resp.sm_lid; port_attr->lmc = resp.lmc; port_attr->max_vl_num = resp.max_vl_num; port_attr->sm_sl = resp.sm_sl; port_attr->subnet_timeout = resp.subnet_timeout; port_attr->init_type_reply = resp.init_type_reply; port_attr->active_width = resp.active_width; port_attr->active_speed = resp.active_speed; port_attr->phys_state = resp.phys_state; port_attr->link_layer = resp.link_layer; return 0; } int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, struct ibv_alloc_pd *cmd, size_t cmd_size, struct ibv_alloc_pd_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_PD, resp, resp_size); if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); pd->handle = resp->pd_handle; pd->context = context; return 0; } int ibv_cmd_dealloc_pd(struct ibv_pd *pd) { struct ibv_dealloc_pd cmd; IBV_INIT_CMD(&cmd, sizeof cmd, DEALLOC_PD); cmd.pd_handle = pd->handle; if (write(pd->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; return 0; } int ibv_cmd_open_xrcd(struct ibv_context *context, struct verbs_xrcd *xrcd, int vxrcd_size, struct ibv_xrcd_init_attr *attr, struct ibv_open_xrcd *cmd, size_t cmd_size, struct ibv_open_xrcd_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, OPEN_XRCD, resp, resp_size); if (attr->comp_mask >= IBV_XRCD_INIT_ATTR_RESERVED) return ENOSYS; if (!(attr->comp_mask & IBV_XRCD_INIT_ATTR_FD) || !(attr->comp_mask & IBV_XRCD_INIT_ATTR_OFLAGS)) return EINVAL; cmd->fd = attr->fd; cmd->oflags = attr->oflags; if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); xrcd->xrcd.context = context; xrcd->comp_mask = 0; if (vext_field_avail(struct verbs_xrcd, handle, vxrcd_size)) { xrcd->comp_mask = VERBS_XRCD_HANDLE; xrcd->handle = resp->xrcd_handle; } return 0; } int ibv_cmd_close_xrcd(struct verbs_xrcd *xrcd) { struct ibv_close_xrcd cmd; IBV_INIT_CMD(&cmd, sizeof cmd, CLOSE_XRCD); cmd.xrcd_handle = xrcd->handle; if (write(xrcd->xrcd.context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; return 0; } int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access, struct ibv_mr *mr, struct ibv_reg_mr *cmd, size_t cmd_size, struct ibv_reg_mr_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, REG_MR, resp, resp_size); cmd->start = (uintptr_t) addr; cmd->length = length; cmd->hca_va = hca_va; cmd->pd_handle = pd->handle; cmd->access_flags = access; if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); mr->handle = resp->mr_handle; mr->lkey = resp->lkey; mr->rkey = resp->rkey; mr->context = pd->context; return 0; } int ibv_cmd_rereg_mr(struct ibv_mr *mr, uint32_t flags, void *addr, size_t length, uint64_t hca_va, int access, struct ibv_pd *pd, struct ibv_rereg_mr *cmd, size_t cmd_sz, struct ibv_rereg_mr_resp *resp, size_t resp_sz) { IBV_INIT_CMD_RESP(cmd, cmd_sz, REREG_MR, resp, resp_sz); cmd->mr_handle 
= mr->handle; cmd->flags = flags; cmd->start = (uintptr_t)addr; cmd->length = length; cmd->hca_va = hca_va; cmd->pd_handle = (flags & IBV_REREG_MR_CHANGE_PD) ? pd->handle : 0; cmd->access_flags = access; if (write(mr->context->cmd_fd, cmd, cmd_sz) != cmd_sz) return errno; (void)VALGRIND_MAKE_MEM_DEFINED(resp, resp_sz); mr->lkey = resp->lkey; mr->rkey = resp->rkey; if (flags & IBV_REREG_MR_CHANGE_PD) mr->context = pd->context; return 0; } int ibv_cmd_dereg_mr(struct ibv_mr *mr) { struct ibv_dereg_mr cmd; IBV_INIT_CMD(&cmd, sizeof cmd, DEREG_MR); cmd.mr_handle = mr->handle; if (write(mr->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; return 0; } int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type, struct ibv_mw *mw, struct ibv_alloc_mw *cmd, size_t cmd_size, struct ibv_alloc_mw_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_MW, resp, resp_size); cmd->pd_handle = pd->handle; cmd->mw_type = type; memset(cmd->reserved, 0, sizeof(cmd->reserved)); if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); mw->context = pd->context; mw->pd = pd; mw->rkey = resp->rkey; mw->handle = resp->mw_handle; mw->type = type; return 0; } int ibv_cmd_dealloc_mw(struct ibv_mw *mw, struct ibv_dealloc_mw *cmd, size_t cmd_size) { IBV_INIT_CMD(cmd, cmd_size, DEALLOC_MW); cmd->mw_handle = mw->handle; cmd->reserved = 0; if (write(mw->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; return 0; } int ibv_cmd_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector, struct ibv_cq *cq, struct ibv_create_cq *cmd, size_t cmd_size, struct ibv_create_cq_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_CQ, resp, resp_size); cmd->user_handle = (uintptr_t) cq; cmd->cqe = cqe; cmd->comp_vector = comp_vector; cmd->comp_channel = channel ? channel->fd : -1; cmd->reserved = 0; if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); cq->handle = resp->cq_handle; cq->cqe = resp->cqe; cq->context = context; return 0; } int ibv_cmd_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, struct ibv_cq_ex *cq, struct ibv_create_cq_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_cq_resp_ex *resp, size_t resp_core_size, size_t resp_size) { int err; memset(cmd, 0, cmd_core_size); IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, CREATE_CQ_EX, resp, resp_core_size, resp_size); if (cq_attr->comp_mask & ~(IBV_CQ_INIT_ATTR_MASK_RESERVED - 1)) return EINVAL; cmd->user_handle = (uintptr_t)cq; cmd->cqe = cq_attr->cqe; cmd->comp_vector = cq_attr->comp_vector; cmd->comp_channel = cq_attr->channel ? 
cq_attr->channel->fd : -1; cmd->comp_mask = 0; if (cmd_core_size >= offsetof(struct ibv_create_cq_ex, flags) + sizeof(cmd->flags)) { if ((cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS) && (cq_attr->flags & ~(IBV_CREATE_CQ_ATTR_RESERVED - 1))) return EOPNOTSUPP; if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) cmd->flags |= IBV_CREATE_CQ_EX_KERNEL_FLAG_COMPLETION_TIMESTAMP; } err = write(context->cmd_fd, cmd, cmd_size); if (err != cmd_size) return errno; (void)VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); cq->handle = resp->base.cq_handle; cq->cqe = resp->base.cqe; cq->context = context; return 0; } int ibv_cmd_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) { struct ibv_poll_cq cmd; struct ibv_poll_cq_resp *resp; int i; int rsize; int ret; rsize = sizeof *resp + ne * sizeof(struct ibv_kern_wc); resp = malloc(rsize); if (!resp) return -1; IBV_INIT_CMD_RESP(&cmd, sizeof cmd, POLL_CQ, resp, rsize); cmd.cq_handle = ibcq->handle; cmd.ne = ne; if (write(ibcq->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) { ret = -1; goto out; } (void) VALGRIND_MAKE_MEM_DEFINED(resp, rsize); for (i = 0; i < resp->count; i++) { wc[i].wr_id = resp->wc[i].wr_id; wc[i].status = resp->wc[i].status; wc[i].opcode = resp->wc[i].opcode; wc[i].vendor_err = resp->wc[i].vendor_err; wc[i].byte_len = resp->wc[i].byte_len; wc[i].imm_data = resp->wc[i].imm_data; wc[i].qp_num = resp->wc[i].qp_num; wc[i].src_qp = resp->wc[i].src_qp; wc[i].wc_flags = resp->wc[i].wc_flags; wc[i].pkey_index = resp->wc[i].pkey_index; wc[i].slid = resp->wc[i].slid; wc[i].sl = resp->wc[i].sl; wc[i].dlid_path_bits = resp->wc[i].dlid_path_bits; } ret = resp->count; out: free(resp); return ret; } int ibv_cmd_req_notify_cq(struct ibv_cq *ibcq, int solicited_only) { struct ibv_req_notify_cq cmd; IBV_INIT_CMD(&cmd, sizeof cmd, REQ_NOTIFY_CQ); cmd.cq_handle = ibcq->handle; cmd.solicited = !!solicited_only; if (write(ibcq->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; return 0; } int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, struct ibv_resize_cq *cmd, size_t cmd_size, struct ibv_resize_cq_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, RESIZE_CQ, resp, resp_size); cmd->cq_handle = cq->handle; cmd->cqe = cqe; if (write(cq->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); cq->cqe = resp->cqe; return 0; } int ibv_cmd_destroy_cq(struct ibv_cq *cq) { struct ibv_destroy_cq cmd; struct ibv_destroy_cq_resp resp; IBV_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_CQ, &resp, sizeof resp); cmd.cq_handle = cq->handle; cmd.reserved = 0; if (write(cq->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); pthread_mutex_lock(&cq->mutex); while (cq->comp_events_completed != resp.comp_events_reported || cq->async_events_completed != resp.async_events_reported) pthread_cond_wait(&cq->cond, &cq->mutex); pthread_mutex_unlock(&cq->mutex); return 0; } int ibv_cmd_create_srq(struct ibv_pd *pd, struct ibv_srq *srq, struct ibv_srq_init_attr *attr, struct ibv_create_srq *cmd, size_t cmd_size, struct ibv_create_srq_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_SRQ, resp, resp_size); cmd->user_handle = (uintptr_t) srq; cmd->pd_handle = pd->handle; cmd->max_wr = attr->attr.max_wr; cmd->max_sge = attr->attr.max_sge; cmd->srq_limit = attr->attr.srq_limit; if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); srq->handle 
= resp->srq_handle; srq->context = pd->context; if (abi_ver > 5) { attr->attr.max_wr = resp->max_wr; attr->attr.max_sge = resp->max_sge; } else { struct ibv_create_srq_resp_v5 *resp_v5 = (struct ibv_create_srq_resp_v5 *) resp; memmove((void *) resp + sizeof *resp, (void *) resp_v5 + sizeof *resp_v5, resp_size - sizeof *resp); } return 0; } int ibv_cmd_create_srq_ex(struct ibv_context *context, struct verbs_srq *srq, int vsrq_sz, struct ibv_srq_init_attr_ex *attr_ex, struct ibv_create_xsrq *cmd, size_t cmd_size, struct ibv_create_srq_resp *resp, size_t resp_size) { struct verbs_xrcd *vxrcd = NULL; + int ret = 0; IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_XSRQ, resp, resp_size); if (attr_ex->comp_mask >= IBV_SRQ_INIT_ATTR_RESERVED) return ENOSYS; if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_PD)) return EINVAL; cmd->user_handle = (uintptr_t) srq; cmd->pd_handle = attr_ex->pd->handle; cmd->max_wr = attr_ex->attr.max_wr; cmd->max_sge = attr_ex->attr.max_sge; cmd->srq_limit = attr_ex->attr.srq_limit; cmd->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? attr_ex->srq_type : IBV_SRQT_BASIC; if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) { if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ)) return EINVAL; vxrcd = container_of(attr_ex->xrcd, struct verbs_xrcd, xrcd); cmd->xrcd_handle = vxrcd->handle; cmd->cq_handle = attr_ex->cq->handle; } - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return errno; + ret = pthread_mutex_init(&srq->srq.mutex, NULL); + if (ret) + goto err; + ret = pthread_cond_init(&srq->srq.cond, NULL); + if (ret) + goto err_mutex; + + if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) { + ret = errno; + goto err_cond; + } (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); srq->srq.handle = resp->srq_handle; srq->srq.context = context; srq->srq.srq_context = attr_ex->srq_context; srq->srq.pd = attr_ex->pd; srq->srq.events_completed = 0; - pthread_mutex_init(&srq->srq.mutex, NULL); - pthread_cond_init(&srq->srq.cond, NULL); /* * check that the last field is available. * If it is than all the others exist as well */ if (vext_field_avail(struct verbs_srq, srq_num, vsrq_sz)) { srq->comp_mask = IBV_SRQ_INIT_ATTR_TYPE; srq->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? 
attr_ex->srq_type : IBV_SRQT_BASIC; if (srq->srq_type == IBV_SRQT_XRC) { srq->comp_mask |= VERBS_SRQ_NUM; srq->srq_num = resp->srqn; } if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) { srq->comp_mask |= VERBS_SRQ_XRCD; srq->xrcd = vxrcd; } if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) { srq->comp_mask |= VERBS_SRQ_CQ; srq->cq = attr_ex->cq; } } attr_ex->attr.max_wr = resp->max_wr; attr_ex->attr.max_sge = resp->max_sge; return 0; +err_cond: + pthread_cond_destroy(&srq->srq.cond); +err_mutex: + pthread_mutex_destroy(&srq->srq.mutex); +err: + return ret; } static int ibv_cmd_modify_srq_v3(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask, struct ibv_modify_srq *new_cmd, size_t new_cmd_size) { struct ibv_modify_srq_v3 *cmd; size_t cmd_size; cmd_size = sizeof *cmd + new_cmd_size - sizeof *new_cmd; cmd = alloca(cmd_size); memcpy(cmd->driver_data, new_cmd->driver_data, new_cmd_size - sizeof *new_cmd); IBV_INIT_CMD(cmd, cmd_size, MODIFY_SRQ); cmd->srq_handle = srq->handle; cmd->attr_mask = srq_attr_mask; cmd->max_wr = srq_attr->max_wr; cmd->srq_limit = srq_attr->srq_limit; cmd->max_sge = 0; cmd->reserved = 0; if (write(srq->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; return 0; } int ibv_cmd_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask, struct ibv_modify_srq *cmd, size_t cmd_size) { if (abi_ver == 3) return ibv_cmd_modify_srq_v3(srq, srq_attr, srq_attr_mask, cmd, cmd_size); IBV_INIT_CMD(cmd, cmd_size, MODIFY_SRQ); cmd->srq_handle = srq->handle; cmd->attr_mask = srq_attr_mask; cmd->max_wr = srq_attr->max_wr; cmd->srq_limit = srq_attr->srq_limit; if (write(srq->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; return 0; } int ibv_cmd_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, struct ibv_query_srq *cmd, size_t cmd_size) { struct ibv_query_srq_resp resp; IBV_INIT_CMD_RESP(cmd, cmd_size, QUERY_SRQ, &resp, sizeof resp); cmd->srq_handle = srq->handle; cmd->reserved = 0; if (write(srq->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); srq_attr->max_wr = resp.max_wr; srq_attr->max_sge = resp.max_sge; srq_attr->srq_limit = resp.srq_limit; return 0; } int ibv_cmd_destroy_srq(struct ibv_srq *srq) { struct ibv_destroy_srq cmd; struct ibv_destroy_srq_resp resp; IBV_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_SRQ, &resp, sizeof resp); cmd.srq_handle = srq->handle; cmd.reserved = 0; if (write(srq->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); pthread_mutex_lock(&srq->mutex); while (srq->events_completed != resp.events_reported) pthread_cond_wait(&srq->cond, &srq->mutex); pthread_mutex_unlock(&srq->mutex); + pthread_cond_destroy(&srq->cond); + pthread_mutex_destroy(&srq->mutex); + return 0; } static int create_qp_ex_common(struct verbs_qp *qp, struct ibv_qp_init_attr_ex *qp_attr, struct verbs_xrcd *vxrcd, struct ibv_create_qp_common *cmd) { cmd->user_handle = (uintptr_t)qp; if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) { vxrcd = container_of(qp_attr->xrcd, struct verbs_xrcd, xrcd); cmd->pd_handle = vxrcd->handle; } else { if (!(qp_attr->comp_mask & IBV_QP_INIT_ATTR_PD)) return EINVAL; cmd->pd_handle = qp_attr->pd->handle; if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) { if (cmd->max_recv_wr || cmd->max_recv_sge || cmd->recv_cq_handle || qp_attr->srq) return EINVAL; /* send_cq is optinal */ if (qp_attr->cap.max_send_wr) cmd->send_cq_handle = qp_attr->send_cq->handle; } else { 
cmd->send_cq_handle = qp_attr->send_cq->handle; if (qp_attr->qp_type != IBV_QPT_XRC_SEND) { cmd->recv_cq_handle = qp_attr->recv_cq->handle; cmd->srq_handle = qp_attr->srq ? qp_attr->srq->handle : 0; } } } cmd->max_send_wr = qp_attr->cap.max_send_wr; cmd->max_recv_wr = qp_attr->cap.max_recv_wr; cmd->max_send_sge = qp_attr->cap.max_send_sge; cmd->max_recv_sge = qp_attr->cap.max_recv_sge; cmd->max_inline_data = qp_attr->cap.max_inline_data; cmd->sq_sig_all = qp_attr->sq_sig_all; cmd->qp_type = qp_attr->qp_type; cmd->is_srq = !!qp_attr->srq; cmd->reserved = 0; return 0; } +static int create_qp_handle_resp_common_cleanup(struct verbs_qp *qp) +{ + pthread_cond_destroy(&qp->qp.cond); + pthread_mutex_destroy(&qp->qp.mutex); +} + +static int create_qp_handle_resp_common_init(struct verbs_qp *qp) +{ + int ret = 0; + + ret = pthread_mutex_init(&qp->qp.mutex, NULL); + if (ret) + return ret; + ret = pthread_cond_init(&qp->qp.cond, NULL); + if (ret) + goto err; + + return ret; + +err: + pthread_mutex_destroy(&qp->qp.mutex); + + return ret; +} + static void create_qp_handle_resp_common(struct ibv_context *context, struct verbs_qp *qp, struct ibv_qp_init_attr_ex *qp_attr, struct ibv_create_qp_resp *resp, struct verbs_xrcd *vxrcd, int vqp_sz) { if (abi_ver > 3) { qp_attr->cap.max_recv_sge = resp->max_recv_sge; qp_attr->cap.max_send_sge = resp->max_send_sge; qp_attr->cap.max_recv_wr = resp->max_recv_wr; qp_attr->cap.max_send_wr = resp->max_send_wr; qp_attr->cap.max_inline_data = resp->max_inline_data; } qp->qp.handle = resp->qp_handle; qp->qp.qp_num = resp->qpn; qp->qp.context = context; qp->qp.qp_context = qp_attr->qp_context; qp->qp.pd = qp_attr->pd; qp->qp.send_cq = qp_attr->send_cq; qp->qp.recv_cq = qp_attr->recv_cq; qp->qp.srq = qp_attr->srq; qp->qp.qp_type = qp_attr->qp_type; qp->qp.state = IBV_QPS_RESET; qp->qp.events_completed = 0; - pthread_mutex_init(&qp->qp.mutex, NULL); - pthread_cond_init(&qp->qp.cond, NULL); qp->comp_mask = 0; if (vext_field_avail(struct verbs_qp, xrcd, vqp_sz) && (qp_attr->comp_mask & IBV_QP_INIT_ATTR_XRCD)) { qp->comp_mask |= VERBS_QP_XRCD; qp->xrcd = vxrcd; } } enum { CREATE_QP_EX2_SUP_CREATE_FLAGS = IBV_QP_CREATE_BLOCK_SELF_MCAST_LB | IBV_QP_CREATE_SCATTER_FCS | IBV_QP_CREATE_CVLAN_STRIPPING, }; int ibv_cmd_create_qp_ex2(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_init_attr_ex *qp_attr, struct ibv_create_qp_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_qp_resp_ex *resp, size_t resp_core_size, size_t resp_size) { struct verbs_xrcd *vxrcd = NULL; int err; if (qp_attr->comp_mask >= IBV_QP_INIT_ATTR_RESERVED) return EINVAL; if (resp_core_size < offsetof(struct ibv_create_qp_resp_ex, response_length) + sizeof(resp->response_length)) return EINVAL; memset(cmd, 0, cmd_core_size); IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, CREATE_QP_EX, resp, resp_core_size, resp_size); err = create_qp_ex_common(qp, qp_attr, vxrcd, &cmd->base); if (err) return err; if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) { if (qp_attr->create_flags & ~CREATE_QP_EX2_SUP_CREATE_FLAGS) return EINVAL; if (cmd_core_size < offsetof(struct ibv_create_qp_ex, create_flags) + sizeof(qp_attr->create_flags)) return EINVAL; cmd->create_flags = qp_attr->create_flags; } if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) { if (cmd_core_size < offsetof(struct ibv_create_qp_ex, ind_tbl_handle) + sizeof(cmd->ind_tbl_handle)) return EINVAL; cmd->ind_tbl_handle = qp_attr->rwq_ind_tbl->ind_tbl_handle; cmd->comp_mask = IBV_CREATE_QP_EX_KERNEL_MASK_IND_TABLE; 
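/*
 * The create_qp_handle_resp_common_init()/_cleanup() helpers added above
 * follow an init-with-unwind shape: each pthread primitive is initialized
 * before the command is written to the kernel, and everything initialized
 * so far is torn down if a later step fails.  A minimal standalone sketch
 * of that shape, assuming only POSIX threads (resource_init() and
 * resource_cleanup() are hypothetical names, not part of this patch):
 *
 *	#include <pthread.h>
 *
 *	struct resource {
 *		pthread_mutex_t mutex;
 *		pthread_cond_t  cond;
 *	};
 *
 *	// Initialize both primitives; on failure leave nothing allocated.
 *	static int resource_init(struct resource *r)
 *	{
 *		int ret;
 *
 *		ret = pthread_mutex_init(&r->mutex, NULL);
 *		if (ret)
 *			return ret;
 *		ret = pthread_cond_init(&r->cond, NULL);
 *		if (ret)
 *			pthread_mutex_destroy(&r->mutex);
 *		return ret;
 *	}
 *
 *	// Tear down in reverse order of initialization.
 *	static void resource_cleanup(struct resource *r)
 *	{
 *		pthread_cond_destroy(&r->cond);
 *		pthread_mutex_destroy(&r->mutex);
 *	}
 */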
} + err = create_qp_handle_resp_common_init(qp); + if (err) + return err; + err = write(context->cmd_fd, cmd, cmd_size); - if (err != cmd_size) - return errno; + if (err != cmd_size) { + err = errno; + goto err; + } (void)VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); create_qp_handle_resp_common(context, qp, qp_attr, &resp->base, vxrcd, vqp_sz); return 0; + +err: + create_qp_handle_resp_common_cleanup(qp); + + return err; } int ibv_cmd_create_qp_ex(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_init_attr_ex *attr_ex, struct ibv_create_qp *cmd, size_t cmd_size, struct ibv_create_qp_resp *resp, size_t resp_size) { struct verbs_xrcd *vxrcd = NULL; int err; IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_QP, resp, resp_size); if (attr_ex->comp_mask > (IBV_QP_INIT_ATTR_XRCD | IBV_QP_INIT_ATTR_PD)) return ENOSYS; err = create_qp_ex_common(qp, attr_ex, vxrcd, (struct ibv_create_qp_common *)&cmd->user_handle); if (err) return err; - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return errno; + err = create_qp_handle_resp_common_init(qp); + if (err) + return err; + + err = write(context->cmd_fd, cmd, cmd_size); + if (err != cmd_size) { + err = errno; + goto err; + } (void)VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); if (abi_ver == 4) { struct ibv_create_qp_resp_v4 *resp_v4 = (struct ibv_create_qp_resp_v4 *)resp; memmove((void *)resp + sizeof *resp, (void *)resp_v4 + sizeof *resp_v4, resp_size - sizeof *resp); } else if (abi_ver <= 3) { struct ibv_create_qp_resp_v3 *resp_v3 = (struct ibv_create_qp_resp_v3 *)resp; memmove((void *)resp + sizeof *resp, (void *)resp_v3 + sizeof *resp_v3, resp_size - sizeof *resp); } create_qp_handle_resp_common(context, qp, attr_ex, resp, vxrcd, vqp_sz); return 0; + +err: + create_qp_handle_resp_common_cleanup(qp); + + return err; } int ibv_cmd_create_qp(struct ibv_pd *pd, struct ibv_qp *qp, struct ibv_qp_init_attr *attr, struct ibv_create_qp *cmd, size_t cmd_size, struct ibv_create_qp_resp *resp, size_t resp_size) { IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_QP, resp, resp_size); cmd->user_handle = (uintptr_t) qp; cmd->pd_handle = pd->handle; cmd->send_cq_handle = attr->send_cq->handle; cmd->recv_cq_handle = attr->recv_cq->handle; cmd->srq_handle = attr->srq ? 
attr->srq->handle : 0; cmd->max_send_wr = attr->cap.max_send_wr; cmd->max_recv_wr = attr->cap.max_recv_wr; cmd->max_send_sge = attr->cap.max_send_sge; cmd->max_recv_sge = attr->cap.max_recv_sge; cmd->max_inline_data = attr->cap.max_inline_data; cmd->sq_sig_all = attr->sq_sig_all; cmd->qp_type = attr->qp_type; cmd->is_srq = !!attr->srq; cmd->reserved = 0; if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); qp->handle = resp->qp_handle; qp->qp_num = resp->qpn; qp->context = pd->context; if (abi_ver > 3) { attr->cap.max_recv_sge = resp->max_recv_sge; attr->cap.max_send_sge = resp->max_send_sge; attr->cap.max_recv_wr = resp->max_recv_wr; attr->cap.max_send_wr = resp->max_send_wr; attr->cap.max_inline_data = resp->max_inline_data; } if (abi_ver == 4) { struct ibv_create_qp_resp_v4 *resp_v4 = (struct ibv_create_qp_resp_v4 *) resp; memmove((void *) resp + sizeof *resp, (void *) resp_v4 + sizeof *resp_v4, resp_size - sizeof *resp); } else if (abi_ver <= 3) { struct ibv_create_qp_resp_v3 *resp_v3 = (struct ibv_create_qp_resp_v3 *) resp; memmove((void *) resp + sizeof *resp, (void *) resp_v3 + sizeof *resp_v3, resp_size - sizeof *resp); } return 0; } int ibv_cmd_open_qp(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_open_attr *attr, struct ibv_open_qp *cmd, size_t cmd_size, struct ibv_create_qp_resp *resp, size_t resp_size) { + int err = 0; struct verbs_xrcd *xrcd; IBV_INIT_CMD_RESP(cmd, cmd_size, OPEN_QP, resp, resp_size); if (attr->comp_mask >= IBV_QP_OPEN_ATTR_RESERVED) return ENOSYS; if (!(attr->comp_mask & IBV_QP_OPEN_ATTR_XRCD) || !(attr->comp_mask & IBV_QP_OPEN_ATTR_NUM) || !(attr->comp_mask & IBV_QP_OPEN_ATTR_TYPE)) return EINVAL; xrcd = container_of(attr->xrcd, struct verbs_xrcd, xrcd); cmd->user_handle = (uintptr_t) qp; cmd->pd_handle = xrcd->handle; cmd->qpn = attr->qp_num; cmd->qp_type = attr->qp_type; - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return errno; + err = pthread_mutex_init(&qp->qp.mutex, NULL); + if (err) + return err; + err = pthread_cond_init(&qp->qp.cond, NULL); + if (err) + goto err_mutex; + + err = write(context->cmd_fd, cmd, cmd_size); + if (err != cmd_size) { + err = errno; + goto err_cond; + } (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); qp->qp.handle = resp->qp_handle; qp->qp.context = context; qp->qp.qp_context = attr->qp_context; qp->qp.pd = NULL; qp->qp.send_cq = NULL; qp->qp.recv_cq = NULL; qp->qp.srq = NULL; qp->qp.qp_num = attr->qp_num; qp->qp.qp_type = attr->qp_type; qp->qp.state = IBV_QPS_UNKNOWN; qp->qp.events_completed = 0; - pthread_mutex_init(&qp->qp.mutex, NULL); - pthread_cond_init(&qp->qp.cond, NULL); qp->comp_mask = 0; if (vext_field_avail(struct verbs_qp, xrcd, vqp_sz)) { qp->comp_mask = VERBS_QP_XRCD; qp->xrcd = xrcd; } return 0; + +err_cond: + pthread_cond_destroy(&qp->qp.cond); +err_mutex: + pthread_mutex_destroy(&qp->qp.mutex); + + return err; } int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr, struct ibv_query_qp *cmd, size_t cmd_size) { struct ibv_query_qp_resp resp; /* * Masks over IBV_QP_DEST_QPN are not supported by * that not extended command. 
*/ if (attr_mask & ~((IBV_QP_DEST_QPN << 1) - 1)) return EOPNOTSUPP; IBV_INIT_CMD_RESP(cmd, cmd_size, QUERY_QP, &resp, sizeof resp); cmd->qp_handle = qp->handle; cmd->attr_mask = attr_mask; if (write(qp->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); attr->qkey = resp.qkey; attr->rq_psn = resp.rq_psn; attr->sq_psn = resp.sq_psn; attr->dest_qp_num = resp.dest_qp_num; attr->qp_access_flags = resp.qp_access_flags; attr->pkey_index = resp.pkey_index; attr->alt_pkey_index = resp.alt_pkey_index; attr->qp_state = resp.qp_state; attr->cur_qp_state = resp.cur_qp_state; attr->path_mtu = resp.path_mtu; attr->path_mig_state = resp.path_mig_state; attr->sq_draining = resp.sq_draining; attr->max_rd_atomic = resp.max_rd_atomic; attr->max_dest_rd_atomic = resp.max_dest_rd_atomic; attr->min_rnr_timer = resp.min_rnr_timer; attr->port_num = resp.port_num; attr->timeout = resp.timeout; attr->retry_cnt = resp.retry_cnt; attr->rnr_retry = resp.rnr_retry; attr->alt_port_num = resp.alt_port_num; attr->alt_timeout = resp.alt_timeout; attr->cap.max_send_wr = resp.max_send_wr; attr->cap.max_recv_wr = resp.max_recv_wr; attr->cap.max_send_sge = resp.max_send_sge; attr->cap.max_recv_sge = resp.max_recv_sge; attr->cap.max_inline_data = resp.max_inline_data; memcpy(attr->ah_attr.grh.dgid.raw, resp.dest.dgid, 16); attr->ah_attr.grh.flow_label = resp.dest.flow_label; attr->ah_attr.dlid = resp.dest.dlid; attr->ah_attr.grh.sgid_index = resp.dest.sgid_index; attr->ah_attr.grh.hop_limit = resp.dest.hop_limit; attr->ah_attr.grh.traffic_class = resp.dest.traffic_class; attr->ah_attr.sl = resp.dest.sl; attr->ah_attr.src_path_bits = resp.dest.src_path_bits; attr->ah_attr.static_rate = resp.dest.static_rate; attr->ah_attr.is_global = resp.dest.is_global; attr->ah_attr.port_num = resp.dest.port_num; memcpy(attr->alt_ah_attr.grh.dgid.raw, resp.alt_dest.dgid, 16); attr->alt_ah_attr.grh.flow_label = resp.alt_dest.flow_label; attr->alt_ah_attr.dlid = resp.alt_dest.dlid; attr->alt_ah_attr.grh.sgid_index = resp.alt_dest.sgid_index; attr->alt_ah_attr.grh.hop_limit = resp.alt_dest.hop_limit; attr->alt_ah_attr.grh.traffic_class = resp.alt_dest.traffic_class; attr->alt_ah_attr.sl = resp.alt_dest.sl; attr->alt_ah_attr.src_path_bits = resp.alt_dest.src_path_bits; attr->alt_ah_attr.static_rate = resp.alt_dest.static_rate; attr->alt_ah_attr.is_global = resp.alt_dest.is_global; attr->alt_ah_attr.port_num = resp.alt_dest.port_num; init_attr->qp_context = qp->qp_context; init_attr->send_cq = qp->send_cq; init_attr->recv_cq = qp->recv_cq; init_attr->srq = qp->srq; init_attr->qp_type = qp->qp_type; init_attr->cap.max_send_wr = resp.max_send_wr; init_attr->cap.max_recv_wr = resp.max_recv_wr; init_attr->cap.max_send_sge = resp.max_send_sge; init_attr->cap.max_recv_sge = resp.max_recv_sge; init_attr->cap.max_inline_data = resp.max_inline_data; init_attr->sq_sig_all = resp.sq_sig_all; return 0; } static void copy_modify_qp_fields(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_modify_qp_common *cmd) { cmd->qp_handle = qp->handle; cmd->attr_mask = attr_mask; if (attr_mask & IBV_QP_STATE) cmd->qp_state = attr->qp_state; if (attr_mask & IBV_QP_CUR_STATE) cmd->cur_qp_state = attr->cur_qp_state; if (attr_mask & IBV_QP_EN_SQD_ASYNC_NOTIFY) cmd->en_sqd_async_notify = attr->en_sqd_async_notify; if (attr_mask & IBV_QP_ACCESS_FLAGS) cmd->qp_access_flags = attr->qp_access_flags; if (attr_mask & IBV_QP_PKEY_INDEX) cmd->pkey_index = attr->pkey_index; if (attr_mask & IBV_QP_PORT) 
cmd->port_num = attr->port_num; if (attr_mask & IBV_QP_QKEY) cmd->qkey = attr->qkey; if (attr_mask & IBV_QP_AV) { memcpy(cmd->dest.dgid, attr->ah_attr.grh.dgid.raw, 16); cmd->dest.flow_label = attr->ah_attr.grh.flow_label; cmd->dest.dlid = attr->ah_attr.dlid; cmd->dest.reserved = 0; cmd->dest.sgid_index = attr->ah_attr.grh.sgid_index; cmd->dest.hop_limit = attr->ah_attr.grh.hop_limit; cmd->dest.traffic_class = attr->ah_attr.grh.traffic_class; cmd->dest.sl = attr->ah_attr.sl; cmd->dest.src_path_bits = attr->ah_attr.src_path_bits; cmd->dest.static_rate = attr->ah_attr.static_rate; cmd->dest.is_global = attr->ah_attr.is_global; cmd->dest.port_num = attr->ah_attr.port_num; } if (attr_mask & IBV_QP_PATH_MTU) cmd->path_mtu = attr->path_mtu; if (attr_mask & IBV_QP_TIMEOUT) cmd->timeout = attr->timeout; if (attr_mask & IBV_QP_RETRY_CNT) cmd->retry_cnt = attr->retry_cnt; if (attr_mask & IBV_QP_RNR_RETRY) cmd->rnr_retry = attr->rnr_retry; if (attr_mask & IBV_QP_RQ_PSN) cmd->rq_psn = attr->rq_psn; if (attr_mask & IBV_QP_MAX_QP_RD_ATOMIC) cmd->max_rd_atomic = attr->max_rd_atomic; if (attr_mask & IBV_QP_ALT_PATH) { cmd->alt_pkey_index = attr->alt_pkey_index; cmd->alt_port_num = attr->alt_port_num; cmd->alt_timeout = attr->alt_timeout; memcpy(cmd->alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); cmd->alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; cmd->alt_dest.dlid = attr->alt_ah_attr.dlid; cmd->alt_dest.reserved = 0; cmd->alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; cmd->alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; cmd->alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; cmd->alt_dest.sl = attr->alt_ah_attr.sl; cmd->alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; cmd->alt_dest.static_rate = attr->alt_ah_attr.static_rate; cmd->alt_dest.is_global = attr->alt_ah_attr.is_global; cmd->alt_dest.port_num = attr->alt_ah_attr.port_num; } if (attr_mask & IBV_QP_MIN_RNR_TIMER) cmd->min_rnr_timer = attr->min_rnr_timer; if (attr_mask & IBV_QP_SQ_PSN) cmd->sq_psn = attr->sq_psn; if (attr_mask & IBV_QP_MAX_DEST_RD_ATOMIC) cmd->max_dest_rd_atomic = attr->max_dest_rd_atomic; if (attr_mask & IBV_QP_PATH_MIG_STATE) cmd->path_mig_state = attr->path_mig_state; if (attr_mask & IBV_QP_DEST_QPN) cmd->dest_qp_num = attr->dest_qp_num; cmd->reserved[0] = cmd->reserved[1] = 0; } int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_modify_qp *cmd, size_t cmd_size) { /* * Masks over IBV_QP_DEST_QPN are only supported by * ibv_cmd_modify_qp_ex. 
*/ if (attr_mask & ~((IBV_QP_DEST_QPN << 1) - 1)) return EOPNOTSUPP; IBV_INIT_CMD(cmd, cmd_size, MODIFY_QP); copy_modify_qp_fields(qp, attr, attr_mask, &cmd->base); if (write(qp->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; return 0; } int ibv_cmd_modify_qp_ex(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_modify_qp_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_modify_qp_resp_ex *resp, size_t resp_core_size, size_t resp_size) { if (resp_core_size < offsetof(struct ibv_modify_qp_resp_ex, response_length) + sizeof(resp->response_length)) return EINVAL; IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, MODIFY_QP_EX, resp, resp_core_size, resp_size); copy_modify_qp_fields(qp, attr, attr_mask, &cmd->base); if (attr_mask & IBV_QP_RATE_LIMIT) { if (cmd_size >= offsetof(struct ibv_modify_qp_ex, rate_limit) + sizeof(cmd->rate_limit)) cmd->rate_limit = attr->rate_limit; else return EINVAL; } if (write(qp->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; (void)VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); return 0; } int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { struct ibv_post_send *cmd; struct ibv_post_send_resp resp; struct ibv_send_wr *i; struct ibv_kern_send_wr *n, *tmp; struct ibv_sge *s; unsigned wr_count = 0; unsigned sge_count = 0; int cmd_size; int ret = 0; for (i = wr; i; i = i->next) { wr_count++; sge_count += i->num_sge; } cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; cmd = alloca(cmd_size); IBV_INIT_CMD_RESP(cmd, cmd_size, POST_SEND, &resp, sizeof resp); cmd->qp_handle = ibqp->handle; cmd->wr_count = wr_count; cmd->sge_count = sge_count; cmd->wqe_size = sizeof *n; n = (struct ibv_kern_send_wr *) ((void *) cmd + sizeof *cmd); s = (struct ibv_sge *) (n + wr_count); tmp = n; for (i = wr; i; i = i->next) { tmp->wr_id = i->wr_id; tmp->num_sge = i->num_sge; tmp->opcode = i->opcode; tmp->send_flags = i->send_flags; tmp->imm_data = i->imm_data; if (ibqp->qp_type == IBV_QPT_UD) { tmp->wr.ud.ah = i->wr.ud.ah->handle; tmp->wr.ud.remote_qpn = i->wr.ud.remote_qpn; tmp->wr.ud.remote_qkey = i->wr.ud.remote_qkey; } else { switch (i->opcode) { case IBV_WR_RDMA_WRITE: case IBV_WR_RDMA_WRITE_WITH_IMM: case IBV_WR_RDMA_READ: tmp->wr.rdma.remote_addr = i->wr.rdma.remote_addr; tmp->wr.rdma.rkey = i->wr.rdma.rkey; break; case IBV_WR_ATOMIC_CMP_AND_SWP: case IBV_WR_ATOMIC_FETCH_AND_ADD: tmp->wr.atomic.remote_addr = i->wr.atomic.remote_addr; tmp->wr.atomic.compare_add = i->wr.atomic.compare_add; tmp->wr.atomic.swap = i->wr.atomic.swap; tmp->wr.atomic.rkey = i->wr.atomic.rkey; break; default: break; } } if (tmp->num_sge) { memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); s += tmp->num_sge; } tmp++; } resp.bad_wr = 0; if (write(ibqp->context->cmd_fd, cmd, cmd_size) != cmd_size) ret = errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); wr_count = resp.bad_wr; if (wr_count) { i = wr; while (--wr_count) i = i->next; *bad_wr = i; } else if (ret) *bad_wr = wr; return ret; } int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { struct ibv_post_recv *cmd; struct ibv_post_recv_resp resp; struct ibv_recv_wr *i; struct ibv_kern_recv_wr *n, *tmp; struct ibv_sge *s; unsigned wr_count = 0; unsigned sge_count = 0; int cmd_size; int ret = 0; for (i = wr; i; i = i->next) { wr_count++; sge_count += i->num_sge; } cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; cmd = alloca(cmd_size); IBV_INIT_CMD_RESP(cmd, cmd_size, 
POST_RECV, &resp, sizeof resp); cmd->qp_handle = ibqp->handle; cmd->wr_count = wr_count; cmd->sge_count = sge_count; cmd->wqe_size = sizeof *n; n = (struct ibv_kern_recv_wr *) ((void *) cmd + sizeof *cmd); s = (struct ibv_sge *) (n + wr_count); tmp = n; for (i = wr; i; i = i->next) { tmp->wr_id = i->wr_id; tmp->num_sge = i->num_sge; if (tmp->num_sge) { memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); s += tmp->num_sge; } tmp++; } resp.bad_wr = 0; if (write(ibqp->context->cmd_fd, cmd, cmd_size) != cmd_size) ret = errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); wr_count = resp.bad_wr; if (wr_count) { i = wr; while (--wr_count) i = i->next; *bad_wr = i; } else if (ret) *bad_wr = wr; return ret; } int ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { struct ibv_post_srq_recv *cmd; struct ibv_post_srq_recv_resp resp; struct ibv_recv_wr *i; struct ibv_kern_recv_wr *n, *tmp; struct ibv_sge *s; unsigned wr_count = 0; unsigned sge_count = 0; int cmd_size; int ret = 0; for (i = wr; i; i = i->next) { wr_count++; sge_count += i->num_sge; } cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; cmd = alloca(cmd_size); IBV_INIT_CMD_RESP(cmd, cmd_size, POST_SRQ_RECV, &resp, sizeof resp); cmd->srq_handle = srq->handle; cmd->wr_count = wr_count; cmd->sge_count = sge_count; cmd->wqe_size = sizeof *n; n = (struct ibv_kern_recv_wr *) ((void *) cmd + sizeof *cmd); s = (struct ibv_sge *) (n + wr_count); tmp = n; for (i = wr; i; i = i->next) { tmp->wr_id = i->wr_id; tmp->num_sge = i->num_sge; if (tmp->num_sge) { memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); s += tmp->num_sge; } tmp++; } resp.bad_wr = 0; if (write(srq->context->cmd_fd, cmd, cmd_size) != cmd_size) ret = errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); wr_count = resp.bad_wr; if (wr_count) { i = wr; while (--wr_count) i = i->next; *bad_wr = i; } else if (ret) *bad_wr = wr; return ret; } int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, struct ibv_ah_attr *attr, struct ibv_create_ah_resp *resp, size_t resp_size) { struct ibv_create_ah cmd; IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_AH, resp, resp_size); cmd.user_handle = (uintptr_t) ah; cmd.pd_handle = pd->handle; cmd.attr.dlid = attr->dlid; cmd.attr.sl = attr->sl; cmd.attr.src_path_bits = attr->src_path_bits; cmd.attr.static_rate = attr->static_rate; cmd.attr.is_global = attr->is_global; cmd.attr.port_num = attr->port_num; cmd.attr.grh.flow_label = attr->grh.flow_label; cmd.attr.grh.sgid_index = attr->grh.sgid_index; cmd.attr.grh.hop_limit = attr->grh.hop_limit; cmd.attr.grh.traffic_class = attr->grh.traffic_class; memcpy(cmd.attr.grh.dgid, attr->grh.dgid.raw, 16); if (write(pd->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); ah->handle = resp->handle; ah->context = pd->context; return 0; } int ibv_cmd_destroy_ah(struct ibv_ah *ah) { struct ibv_destroy_ah cmd; IBV_INIT_CMD(&cmd, sizeof cmd, DESTROY_AH); cmd.ah_handle = ah->handle; if (write(ah->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; return 0; } int ibv_cmd_destroy_qp(struct ibv_qp *qp) { struct ibv_destroy_qp cmd; struct ibv_destroy_qp_resp resp; IBV_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_QP, &resp, sizeof resp); cmd.qp_handle = qp->handle; cmd.reserved = 0; if (write(qp->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); pthread_mutex_lock(&qp->mutex); while (qp->events_completed != 
resp.events_reported) pthread_cond_wait(&qp->cond, &qp->mutex); pthread_mutex_unlock(&qp->mutex); + pthread_cond_destroy(&qp->cond); + pthread_mutex_destroy(&qp->mutex); + return 0; } int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) { struct ibv_attach_mcast cmd; IBV_INIT_CMD(&cmd, sizeof cmd, ATTACH_MCAST); memcpy(cmd.gid, gid->raw, sizeof cmd.gid); cmd.qp_handle = qp->handle; cmd.mlid = lid; cmd.reserved = 0; if (write(qp->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; return 0; } int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) { struct ibv_detach_mcast cmd; IBV_INIT_CMD(&cmd, sizeof cmd, DETACH_MCAST); memcpy(cmd.gid, gid->raw, sizeof cmd.gid); cmd.qp_handle = qp->handle; cmd.mlid = lid; cmd.reserved = 0; if (write(qp->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) return errno; return 0; } static int buffer_is_zero(char *addr, ssize_t size) { return addr[0] == 0 && !memcmp(addr, addr + 1, size - 1); } static int get_filters_size(struct ibv_flow_spec *ib_spec, struct ibv_kern_spec *kern_spec, int *ib_filter_size, int *kern_filter_size, enum ibv_flow_spec_type type) { void *ib_spec_filter_mask; int curr_kern_filter_size; int min_filter_size; *ib_filter_size = (ib_spec->hdr.size - sizeof(ib_spec->hdr)) / 2; switch (type) { case IBV_FLOW_SPEC_IPV4_EXT: min_filter_size = offsetof(struct ibv_kern_ipv4_ext_filter, flags) + sizeof(kern_spec->ipv4_ext.mask.flags); curr_kern_filter_size = min_filter_size; ib_spec_filter_mask = (void *)&ib_spec->ipv4_ext.val + *ib_filter_size; break; case IBV_FLOW_SPEC_IPV6: min_filter_size = offsetof(struct ibv_kern_ipv6_filter, hop_limit) + sizeof(kern_spec->ipv6.mask.hop_limit); curr_kern_filter_size = min_filter_size; ib_spec_filter_mask = (void *)&ib_spec->ipv6.val + *ib_filter_size; break; case IBV_FLOW_SPEC_VXLAN_TUNNEL: min_filter_size = offsetof(struct ibv_kern_tunnel_filter, tunnel_id) + sizeof(kern_spec->tunnel.mask.tunnel_id); curr_kern_filter_size = min_filter_size; ib_spec_filter_mask = (void *)&ib_spec->tunnel.val + *ib_filter_size; break; default: return EINVAL; } if (*ib_filter_size < min_filter_size) return EINVAL; if (*ib_filter_size > curr_kern_filter_size && !buffer_is_zero(ib_spec_filter_mask + curr_kern_filter_size, *ib_filter_size - curr_kern_filter_size)) return EOPNOTSUPP; *kern_filter_size = min_t(int, curr_kern_filter_size, *ib_filter_size); return 0; } static int ib_spec_to_kern_spec(struct ibv_flow_spec *ib_spec, struct ibv_kern_spec *kern_spec) { int kern_filter_size; int ib_filter_size; int ret; kern_spec->hdr.type = ib_spec->hdr.type; switch (kern_spec->hdr.type) { case IBV_FLOW_SPEC_ETH: case IBV_FLOW_SPEC_ETH | IBV_FLOW_SPEC_INNER: kern_spec->eth.size = sizeof(struct ibv_kern_spec_eth); memcpy(&kern_spec->eth.val, &ib_spec->eth.val, sizeof(struct ibv_flow_eth_filter)); memcpy(&kern_spec->eth.mask, &ib_spec->eth.mask, sizeof(struct ibv_flow_eth_filter)); break; case IBV_FLOW_SPEC_IPV4: case IBV_FLOW_SPEC_IPV4 | IBV_FLOW_SPEC_INNER: kern_spec->ipv4.size = sizeof(struct ibv_kern_spec_ipv4); memcpy(&kern_spec->ipv4.val, &ib_spec->ipv4.val, sizeof(struct ibv_flow_ipv4_filter)); memcpy(&kern_spec->ipv4.mask, &ib_spec->ipv4.mask, sizeof(struct ibv_flow_ipv4_filter)); break; case IBV_FLOW_SPEC_IPV4_EXT: case IBV_FLOW_SPEC_IPV4_EXT | IBV_FLOW_SPEC_INNER: ret = get_filters_size(ib_spec, kern_spec, &ib_filter_size, &kern_filter_size, IBV_FLOW_SPEC_IPV4_EXT); if (ret) return ret; kern_spec->hdr.type = IBV_FLOW_SPEC_IPV4 | (IBV_FLOW_SPEC_INNER & 
ib_spec->hdr.type); kern_spec->ipv4_ext.size = sizeof(struct ibv_kern_spec_ipv4_ext); memcpy(&kern_spec->ipv4_ext.val, &ib_spec->ipv4_ext.val, kern_filter_size); memcpy(&kern_spec->ipv4_ext.mask, (void *)&ib_spec->ipv4_ext.val + ib_filter_size, kern_filter_size); break; case IBV_FLOW_SPEC_IPV6: case IBV_FLOW_SPEC_IPV6 | IBV_FLOW_SPEC_INNER: ret = get_filters_size(ib_spec, kern_spec, &ib_filter_size, &kern_filter_size, IBV_FLOW_SPEC_IPV6); if (ret) return ret; kern_spec->ipv6.size = sizeof(struct ibv_kern_spec_ipv6); memcpy(&kern_spec->ipv6.val, &ib_spec->ipv6.val, kern_filter_size); memcpy(&kern_spec->ipv6.mask, (void *)&ib_spec->ipv6.val + ib_filter_size, kern_filter_size); break; case IBV_FLOW_SPEC_TCP: case IBV_FLOW_SPEC_UDP: case IBV_FLOW_SPEC_TCP | IBV_FLOW_SPEC_INNER: case IBV_FLOW_SPEC_UDP | IBV_FLOW_SPEC_INNER: kern_spec->tcp_udp.size = sizeof(struct ibv_kern_spec_tcp_udp); memcpy(&kern_spec->tcp_udp.val, &ib_spec->tcp_udp.val, sizeof(struct ibv_flow_ipv4_filter)); memcpy(&kern_spec->tcp_udp.mask, &ib_spec->tcp_udp.mask, sizeof(struct ibv_flow_tcp_udp_filter)); break; case IBV_FLOW_SPEC_VXLAN_TUNNEL: ret = get_filters_size(ib_spec, kern_spec, &ib_filter_size, &kern_filter_size, IBV_FLOW_SPEC_VXLAN_TUNNEL); if (ret) return ret; kern_spec->tunnel.size = sizeof(struct ibv_kern_spec_tunnel); memcpy(&kern_spec->tunnel.val, &ib_spec->tunnel.val, kern_filter_size); memcpy(&kern_spec->tunnel.mask, (void *)&ib_spec->tunnel.val + ib_filter_size, kern_filter_size); break; case IBV_FLOW_SPEC_ACTION_TAG: kern_spec->flow_tag.size = sizeof(struct ibv_kern_spec_action_tag); kern_spec->flow_tag.tag_id = ib_spec->flow_tag.tag_id; break; case IBV_FLOW_SPEC_ACTION_DROP: kern_spec->drop.size = sizeof(struct ibv_kern_spec_action_drop); break; default: return EINVAL; } return 0; } struct ibv_flow *ibv_cmd_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow_attr) { struct ibv_create_flow *cmd; struct ibv_create_flow_resp resp; struct ibv_flow *flow_id; size_t cmd_size; size_t written_size; int i, err; void *kern_spec; void *ib_spec; cmd_size = sizeof(*cmd) + (flow_attr->num_of_specs * sizeof(struct ibv_kern_spec)); cmd = alloca(cmd_size); flow_id = malloc(sizeof(*flow_id)); if (!flow_id) return NULL; memset(cmd, 0, cmd_size); cmd->qp_handle = qp->handle; cmd->flow_attr.type = flow_attr->type; cmd->flow_attr.priority = flow_attr->priority; cmd->flow_attr.num_of_specs = flow_attr->num_of_specs; cmd->flow_attr.port = flow_attr->port; cmd->flow_attr.flags = flow_attr->flags; kern_spec = cmd + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs; i++) { err = ib_spec_to_kern_spec(ib_spec, kern_spec); if (err) { errno = err; goto err; } cmd->flow_attr.size += ((struct ibv_kern_spec *)kern_spec)->hdr.size; kern_spec += ((struct ibv_kern_spec *)kern_spec)->hdr.size; ib_spec += ((struct ibv_flow_spec *)ib_spec)->hdr.size; } written_size = sizeof(*cmd) + cmd->flow_attr.size; IBV_INIT_CMD_RESP_EX_VCMD(cmd, written_size, written_size, CREATE_FLOW, &resp, sizeof(resp)); if (write(qp->context->cmd_fd, cmd, written_size) != written_size) goto err; (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); flow_id->context = qp->context; flow_id->handle = resp.flow_handle; return flow_id; err: free(flow_id); return NULL; } int ibv_cmd_destroy_flow(struct ibv_flow *flow_id) { struct ibv_destroy_flow cmd; int ret = 0; memset(&cmd, 0, sizeof(cmd)); IBV_INIT_CMD_EX(&cmd, sizeof(cmd), DESTROY_FLOW); cmd.flow_handle = flow_id->handle; if (write(flow_id->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) ret = 
errno; free(flow_id); return ret; } int ibv_cmd_create_wq(struct ibv_context *context, struct ibv_wq_init_attr *wq_init_attr, struct ibv_wq *wq, struct ibv_create_wq *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_wq_resp *resp, size_t resp_core_size, size_t resp_size) { int err; if (wq_init_attr->comp_mask >= IBV_WQ_INIT_ATTR_RESERVED) return EINVAL; IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, CREATE_WQ, resp, resp_core_size, resp_size); cmd->user_handle = (uintptr_t)wq; cmd->pd_handle = wq_init_attr->pd->handle; cmd->cq_handle = wq_init_attr->cq->handle; cmd->wq_type = wq_init_attr->wq_type; cmd->max_sge = wq_init_attr->max_sge; cmd->max_wr = wq_init_attr->max_wr; cmd->comp_mask = 0; if (cmd_core_size >= offsetof(struct ibv_create_wq, create_flags) + sizeof(cmd->create_flags)) { if (wq_init_attr->comp_mask & IBV_WQ_INIT_ATTR_FLAGS) { if (wq_init_attr->create_flags & ~(IBV_WQ_FLAGS_RESERVED - 1)) return EOPNOTSUPP; cmd->create_flags = wq_init_attr->create_flags; } } err = write(context->cmd_fd, cmd, cmd_size); if (err != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); if (resp->response_length < resp_core_size) return EINVAL; wq->handle = resp->wq_handle; wq_init_attr->max_wr = resp->max_wr; wq_init_attr->max_sge = resp->max_sge; wq->wq_num = resp->wqn; wq->context = context; wq->cq = wq_init_attr->cq; wq->pd = wq_init_attr->pd; wq->wq_type = wq_init_attr->wq_type; return 0; } int ibv_cmd_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr, struct ibv_modify_wq *cmd, size_t cmd_core_size, size_t cmd_size) { if (attr->attr_mask >= IBV_WQ_ATTR_RESERVED) return EINVAL; memset(cmd, 0, cmd_core_size); IBV_INIT_CMD_EX(cmd, cmd_size, MODIFY_WQ); cmd->curr_wq_state = attr->curr_wq_state; cmd->wq_state = attr->wq_state; if (cmd_core_size >= offsetof(struct ibv_modify_wq, flags_mask) + sizeof(cmd->flags_mask)) { if (attr->attr_mask & IBV_WQ_ATTR_FLAGS) { if (attr->flags_mask & ~(IBV_WQ_FLAGS_RESERVED - 1)) return EOPNOTSUPP; cmd->flags = attr->flags; cmd->flags_mask = attr->flags_mask; } } cmd->wq_handle = wq->handle; cmd->attr_mask = attr->attr_mask; if (write(wq->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; if (attr->attr_mask & IBV_WQ_ATTR_STATE) wq->state = attr->wq_state; return 0; } int ibv_cmd_destroy_wq(struct ibv_wq *wq) { struct ibv_destroy_wq cmd; struct ibv_destroy_wq_resp resp; int ret = 0; memset(&cmd, 0, sizeof(cmd)); memset(&resp, 0, sizeof(resp)); IBV_INIT_CMD_RESP_EX(&cmd, sizeof(cmd), DESTROY_WQ, &resp, sizeof(resp)); cmd.wq_handle = wq->handle; if (write(wq->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) return errno; if (resp.response_length < sizeof(resp)) return EINVAL; pthread_mutex_lock(&wq->mutex); while (wq->events_completed != resp.events_reported) pthread_cond_wait(&wq->cond, &wq->mutex); pthread_mutex_unlock(&wq->mutex); return ret; } int ibv_cmd_create_rwq_ind_table(struct ibv_context *context, struct ibv_rwq_ind_table_init_attr *init_attr, struct ibv_rwq_ind_table *rwq_ind_table, struct ibv_create_rwq_ind_table *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_rwq_ind_table_resp *resp, size_t resp_core_size, size_t resp_size) { int err, i; uint32_t required_tbl_size, alloc_tbl_size; uint32_t *tbl_start; int num_tbl_entries; if (init_attr->comp_mask >= IBV_CREATE_IND_TABLE_RESERVED) return EINVAL; alloc_tbl_size = cmd_core_size - sizeof(*cmd); num_tbl_entries = 1 << init_attr->log_ind_tbl_size; /* Data must be u64 aligned */ required_tbl_size = (num_tbl_entries * sizeof(uint32_t)) < 
sizeof(uint64_t) ? sizeof(uint64_t) : (num_tbl_entries * sizeof(uint32_t)); if (alloc_tbl_size < required_tbl_size) return EINVAL; tbl_start = (uint32_t *)((uint8_t *)cmd + sizeof(*cmd)); for (i = 0; i < num_tbl_entries; i++) tbl_start[i] = init_attr->ind_tbl[i]->handle; IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, CREATE_RWQ_IND_TBL, resp, resp_core_size, resp_size); cmd->log_ind_tbl_size = init_attr->log_ind_tbl_size; cmd->comp_mask = 0; err = write(context->cmd_fd, cmd, cmd_size); if (err != cmd_size) return errno; (void) VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); if (resp->response_length < resp_core_size) return EINVAL; rwq_ind_table->ind_tbl_handle = resp->ind_tbl_handle; rwq_ind_table->ind_tbl_num = resp->ind_tbl_num; rwq_ind_table->context = context; return 0; } int ibv_cmd_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) { struct ibv_destroy_rwq_ind_table cmd; int ret = 0; memset(&cmd, 0, sizeof(cmd)); IBV_INIT_CMD_EX(&cmd, sizeof(cmd), DESTROY_RWQ_IND_TBL); cmd.ind_tbl_handle = rwq_ind_table->ind_tbl_handle; if (write(rwq_ind_table->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) ret = errno; return ret; } diff --git a/contrib/ofed/libibverbs/device.c b/contrib/ofed/libibverbs/device.c index d5cd2173cd8b..c3d0dbf573ab 100644 --- a/contrib/ofed/libibverbs/device.c +++ b/contrib/ofed/libibverbs/device.c @@ -1,395 +1,462 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include "ibverbs.h" /* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse with these prototypes. Symbol versionining requires the goofy names, the prototype must match the version in verbs.h. 
*/ struct ibv_device **__ibv_get_device_list(int *num_devices); void __ibv_free_device_list(struct ibv_device **list); const char *__ibv_get_device_name(struct ibv_device *device); __be64 __ibv_get_device_guid(struct ibv_device *device); struct ibv_context *__ibv_open_device(struct ibv_device *device); int __ibv_close_device(struct ibv_context *context); int __ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); void __ibv_ack_async_event(struct ibv_async_event *event); static pthread_once_t device_list_once = PTHREAD_ONCE_INIT; static int num_devices; static struct ibv_device **device_list; static void count_devices(void) { num_devices = ibverbs_init(&device_list); } struct ibv_device **__ibv_get_device_list(int *num) { struct ibv_device **l; int i; if (num) *num = 0; pthread_once(&device_list_once, count_devices); if (num_devices < 0) { errno = -num_devices; return NULL; } l = calloc(num_devices + 1, sizeof (struct ibv_device *)); if (!l) { errno = ENOMEM; return NULL; } for (i = 0; i < num_devices; ++i) l[i] = device_list[i]; if (num) *num = num_devices; return l; } default_symver(__ibv_get_device_list, ibv_get_device_list); void __ibv_free_device_list(struct ibv_device **list) { free(list); } default_symver(__ibv_free_device_list, ibv_free_device_list); const char *__ibv_get_device_name(struct ibv_device *device) { return device->name; } default_symver(__ibv_get_device_name, ibv_get_device_name); __be64 __ibv_get_device_guid(struct ibv_device *device) { char attr[24]; uint64_t guid = 0; uint16_t parts[4]; int i; if (ibv_read_sysfs_file(device->ibdev_path, "node_guid", attr, sizeof attr) < 0) return 0; if (sscanf(attr, "%hx:%hx:%hx:%hx", parts, parts + 1, parts + 2, parts + 3) != 4) return 0; for (i = 0; i < 4; ++i) guid = (guid << 16) | parts[i]; return htobe64(guid); } default_symver(__ibv_get_device_guid, ibv_get_device_guid); -void verbs_init_cq(struct ibv_cq *cq, struct ibv_context *context, +int verbs_init_cq(struct ibv_cq *cq, struct ibv_context *context, struct ibv_comp_channel *channel, void *cq_context) { + int err = 0; + cq->context = context; cq->channel = channel; + err = pthread_mutex_init(&cq->mutex, NULL); + if (err) + return err; + err = pthread_cond_init(&cq->cond, NULL); + if (err) + goto err; + if (cq->channel) { pthread_mutex_lock(&context->mutex); ++cq->channel->refcnt; pthread_mutex_unlock(&context->mutex); } cq->cq_context = cq_context; cq->comp_events_completed = 0; cq->async_events_completed = 0; - pthread_mutex_init(&cq->mutex, NULL); - pthread_cond_init(&cq->cond, NULL); + + return err; + +err: + pthread_mutex_destroy(&cq->mutex); + + return err; +} + +void verbs_cleanup_cq(struct ibv_cq *cq) +{ + pthread_cond_destroy(&cq->cond); + pthread_mutex_destroy(&cq->mutex); } static struct ibv_cq_ex * __lib_ibv_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr) { struct verbs_context *vctx = verbs_get_ctx(context); struct ibv_cq_ex *cq; + int err = 0; if (cq_attr->wc_flags & ~IBV_CREATE_CQ_SUP_WC_FLAGS) { errno = EOPNOTSUPP; return NULL; } cq = vctx->priv->create_cq_ex(context, cq_attr); + if (!cq) + return NULL; - if (cq) - verbs_init_cq(ibv_cq_ex_to_cq(cq), context, - cq_attr->channel, cq_attr->cq_context); + err = verbs_init_cq(ibv_cq_ex_to_cq(cq), context, + cq_attr->channel, cq_attr->cq_context); + if (err) + goto err; return cq; + +err: + context->ops.destroy_cq(ibv_cq_ex_to_cq(cq)); + + return NULL; } struct ibv_context *__ibv_open_device(struct ibv_device *device) { struct verbs_device *verbs_device = 
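The node GUID parsed by __ibv_get_device_guid() above arrives from sysfs as four colon-separated 16-bit hex words. A self-contained sketch of the same parse, only to make the byte-order handling concrete (htobe64() lives in <endian.h> on Linux and <sys/endian.h> on FreeBSD):

#include <stdint.h>
#include <stdio.h>
#include <endian.h>

static uint64_t parse_node_guid(const char *attr)
{
	uint16_t parts[4];
	uint64_t guid = 0;
	int i;

	if (sscanf(attr, "%hx:%hx:%hx:%hx",
		   parts, parts + 1, parts + 2, parts + 3) != 4)
		return 0;
	for (i = 0; i < 4; ++i)
		guid = (guid << 16) | parts[i];
	return htobe64(guid);		/* callers receive a __be64 in network order */
}

int main(void)
{
	/* example string in the sysfs node_guid format */
	printf("0x%016llx\n",
	       (unsigned long long)parse_node_guid("0002:c903:0010:abcd"));
	return 0;
}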
verbs_get_device(device); char *devpath; int cmd_fd, ret; struct ibv_context *context; struct verbs_context *context_ex; if (asprintf(&devpath, "/dev/%s", device->dev_name) < 0) return NULL; /* * We'll only be doing writes, but we need O_RDWR in case the * provider needs to mmap() the file. */ cmd_fd = open(devpath, O_RDWR | O_CLOEXEC); free(devpath); if (cmd_fd < 0) return NULL; if (!verbs_device->ops->init_context) { context = verbs_device->ops->alloc_context(device, cmd_fd); if (!context) goto err; + + if (pthread_mutex_init(&context->mutex, NULL)) { + verbs_device->ops->free_context(context); + goto err; + } } else { struct verbs_ex_private *priv; /* Library now allocates the context */ context_ex = calloc(1, sizeof(*context_ex) + verbs_device->size_of_context); if (!context_ex) { errno = ENOMEM; goto err; } priv = calloc(1, sizeof(*priv)); if (!priv) { errno = ENOMEM; - free(context_ex); - goto err; + goto err_context; } context_ex->priv = priv; context_ex->context.abi_compat = __VERBS_ABI_IS_EXTENDED; context_ex->sz = sizeof(*context_ex); context = &context_ex->context; + if (pthread_mutex_init(&context->mutex, NULL)) + goto verbs_err; + ret = verbs_device->ops->init_context(verbs_device, context, cmd_fd); if (ret) - goto verbs_err; + goto err_mutex; /* * In order to maintain backward/forward binary compatibility * with apps compiled against libibverbs-1.1.8 that use the * flow steering addition, we need to set the two * ABI_placeholder entries to match the driver set flow * entries. This is because apps compiled against * libibverbs-1.1.8 use an inline ibv_create_flow and * ibv_destroy_flow function that looks in the placeholder * spots for the proper entry points. For apps compiled * against libibverbs-1.1.9 and later, the inline functions * will be looking in the right place. 
*/ context_ex->ABI_placeholder1 = (void (*)(void)) context_ex->ibv_create_flow; context_ex->ABI_placeholder2 = (void (*)(void)) context_ex->ibv_destroy_flow; if (context_ex->create_cq_ex) { priv->create_cq_ex = context_ex->create_cq_ex; context_ex->create_cq_ex = __lib_ibv_create_cq_ex; } } context->device = device; context->cmd_fd = cmd_fd; - pthread_mutex_init(&context->mutex, NULL); return context; +err_mutex: + pthread_mutex_destroy(&context->mutex); verbs_err: free(context_ex->priv); +err_context: free(context_ex); err: close(cmd_fd); return NULL; } default_symver(__ibv_open_device, ibv_open_device); int __ibv_close_device(struct ibv_context *context) { int async_fd = context->async_fd; int cmd_fd = context->cmd_fd; struct verbs_context *context_ex; struct verbs_device *verbs_device = verbs_get_device(context->device); + pthread_mutex_destroy(&context->mutex); context_ex = verbs_get_ctx(context); if (context_ex) { verbs_device->ops->uninit_context(verbs_device, context); free(context_ex->priv); free(context_ex); } else { verbs_device->ops->free_context(context); } close(async_fd); close(cmd_fd); return 0; } default_symver(__ibv_close_device, ibv_close_device); int __ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { struct ibv_kern_async_event ev; if (read(context->async_fd, &ev, sizeof ev) != sizeof ev) return -1; event->event_type = ev.event_type; switch (event->event_type) { case IBV_EVENT_CQ_ERR: event->element.cq = (void *) (uintptr_t) ev.element; break; case IBV_EVENT_QP_FATAL: case IBV_EVENT_QP_REQ_ERR: case IBV_EVENT_QP_ACCESS_ERR: case IBV_EVENT_COMM_EST: case IBV_EVENT_SQ_DRAINED: case IBV_EVENT_PATH_MIG: case IBV_EVENT_PATH_MIG_ERR: case IBV_EVENT_QP_LAST_WQE_REACHED: event->element.qp = (void *) (uintptr_t) ev.element; break; case IBV_EVENT_SRQ_ERR: case IBV_EVENT_SRQ_LIMIT_REACHED: event->element.srq = (void *) (uintptr_t) ev.element; break; case IBV_EVENT_WQ_FATAL: event->element.wq = (void *) (uintptr_t) ev.element; break; default: event->element.port_num = ev.element; break; } if (context->ops.async_event) context->ops.async_event(event); return 0; } default_symver(__ibv_get_async_event, ibv_get_async_event); void __ibv_ack_async_event(struct ibv_async_event *event) { switch (event->event_type) { case IBV_EVENT_CQ_ERR: { struct ibv_cq *cq = event->element.cq; pthread_mutex_lock(&cq->mutex); ++cq->async_events_completed; pthread_cond_signal(&cq->cond); pthread_mutex_unlock(&cq->mutex); return; } case IBV_EVENT_QP_FATAL: case IBV_EVENT_QP_REQ_ERR: case IBV_EVENT_QP_ACCESS_ERR: case IBV_EVENT_COMM_EST: case IBV_EVENT_SQ_DRAINED: case IBV_EVENT_PATH_MIG: case IBV_EVENT_PATH_MIG_ERR: case IBV_EVENT_QP_LAST_WQE_REACHED: { struct ibv_qp *qp = event->element.qp; pthread_mutex_lock(&qp->mutex); ++qp->events_completed; pthread_cond_signal(&qp->cond); pthread_mutex_unlock(&qp->mutex); return; } case IBV_EVENT_SRQ_ERR: case IBV_EVENT_SRQ_LIMIT_REACHED: { struct ibv_srq *srq = event->element.srq; pthread_mutex_lock(&srq->mutex); ++srq->events_completed; pthread_cond_signal(&srq->cond); pthread_mutex_unlock(&srq->mutex); return; } case IBV_EVENT_WQ_FATAL: { struct ibv_wq *wq = event->element.wq; pthread_mutex_lock(&wq->mutex); ++wq->events_completed; pthread_cond_signal(&wq->cond); pthread_mutex_unlock(&wq->mutex); return; } default: return; } } default_symver(__ibv_ack_async_event, ibv_ack_async_event); + +int __ibv_init_wq(struct ibv_wq *wq) +{ + int err = 0; + wq->events_completed = 0; + err = pthread_mutex_init(&wq->mutex, NULL); + if (err) + 
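The get/ack pair above is driven from an application thread; every event must eventually be acknowledged, because the destroy paths shown here block until events_completed catches up with what the kernel reported. A minimal consumer sketch (error handling trimmed):

#include <infiniband/verbs.h>
#include <stdio.h>

static void drain_async_events(struct ibv_context *ctx)
{
	struct ibv_async_event ev;

	for (;;) {
		if (ibv_get_async_event(ctx, &ev))	/* blocks on context->async_fd */
			break;
		fprintf(stderr, "async event: %s\n",
			ibv_event_type_str(ev.event_type));
		ibv_ack_async_event(&ev);		/* unblocks the destroy paths */
	}
}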
return err; + + err = pthread_cond_init(&wq->cond, NULL); + if (err) + goto err; + + return err; + +err: + pthread_mutex_destroy(&wq->mutex); + + return err; +} +default_symver(__ibv_init_wq, ibv_init_wq); + +void __ibv_cleanup_wq(struct ibv_wq *wq) +{ + pthread_cond_destroy(&wq->cond); + pthread_mutex_destroy(&wq->mutex); +} +default_symver(__ibv_cleanup_wq, ibv_cleanup_wq); diff --git a/contrib/ofed/libibverbs/driver.h b/contrib/ofed/libibverbs/driver.h index ec87afd7f11e..60824bf27e37 100644 --- a/contrib/ofed/libibverbs/driver.h +++ b/contrib/ofed/libibverbs/driver.h @@ -1,325 +1,328 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef INFINIBAND_DRIVER_H #define INFINIBAND_DRIVER_H #include #include #ifdef __cplusplus # define BEGIN_C_DECLS extern "C" { # define END_C_DECLS } #else /* !__cplusplus */ # define BEGIN_C_DECLS # define END_C_DECLS #endif /* __cplusplus */ /* * Extension that low-level drivers should add to their .so filename * (probably via libtool "-release" option). For example a low-level * driver named "libfoo" should build a plug-in named "libfoo-rdmav2.so". */ #define IBV_DEVICE_LIBRARY_EXTENSION rdmav2 struct verbs_device; enum verbs_xrcd_mask { VERBS_XRCD_HANDLE = 1 << 0, VERBS_XRCD_RESERVED = 1 << 1 }; struct verbs_xrcd { struct ibv_xrcd xrcd; uint32_t comp_mask; uint32_t handle; }; enum verbs_srq_mask { VERBS_SRQ_TYPE = 1 << 0, VERBS_SRQ_XRCD = 1 << 1, VERBS_SRQ_CQ = 1 << 2, VERBS_SRQ_NUM = 1 << 3, VERBS_SRQ_RESERVED = 1 << 4 }; struct verbs_srq { struct ibv_srq srq; uint32_t comp_mask; enum ibv_srq_type srq_type; struct verbs_xrcd *xrcd; struct ibv_cq *cq; uint32_t srq_num; }; enum verbs_qp_mask { VERBS_QP_XRCD = 1 << 0, VERBS_QP_RESERVED = 1 << 1 }; enum ibv_gid_type { IBV_GID_TYPE_IB_ROCE_V1, IBV_GID_TYPE_ROCE_V2, }; struct verbs_qp { struct ibv_qp qp; uint32_t comp_mask; struct verbs_xrcd *xrcd; }; /* Must change the PRIVATE IBVERBS_PRIVATE_ symbol if this is changed */ struct verbs_device_ops { /* Old interface, do not use in new code.
*/ struct ibv_context *(*alloc_context)(struct ibv_device *device, int cmd_fd); void (*free_context)(struct ibv_context *context); /* New interface */ int (*init_context)(struct verbs_device *device, struct ibv_context *ctx, int cmd_fd); void (*uninit_context)(struct verbs_device *device, struct ibv_context *ctx); }; /* Must change the PRIVATE IBVERBS_PRIVATE_ symbol if this is changed */ struct verbs_device { struct ibv_device device; /* Must be first */ const struct verbs_device_ops *ops; size_t sz; size_t size_of_context; }; static inline struct verbs_device * verbs_get_device(const struct ibv_device *dev) { return container_of(dev, struct verbs_device, device); } typedef struct verbs_device *(*verbs_driver_init_func)(const char *uverbs_sys_path, int abi_version); void verbs_register_driver(const char *name, verbs_driver_init_func init_func); -void verbs_init_cq(struct ibv_cq *cq, struct ibv_context *context, +int verbs_init_cq(struct ibv_cq *cq, struct ibv_context *context, struct ibv_comp_channel *channel, void *cq_context); +void verbs_cleanup_cq(struct ibv_cq *cq); +int ibv_init_wq(struct ibv_wq *wq); +void ibv_cleanup_wq(struct ibv_wq *wq); int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, size_t cmd_size, struct ibv_get_context_resp *resp, size_t resp_size); int ibv_cmd_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr, uint64_t *raw_fw_ver, struct ibv_query_device *cmd, size_t cmd_size); int ibv_cmd_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size, uint64_t *raw_fw_ver, struct ibv_query_device_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_query_device_resp_ex *resp, size_t resp_core_size, size_t resp_size); int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr, struct ibv_query_port *cmd, size_t cmd_size); int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, struct ibv_alloc_pd *cmd, size_t cmd_size, struct ibv_alloc_pd_resp *resp, size_t resp_size); int ibv_cmd_dealloc_pd(struct ibv_pd *pd); int ibv_cmd_open_xrcd(struct ibv_context *context, struct verbs_xrcd *xrcd, int vxrcd_size, struct ibv_xrcd_init_attr *attr, struct ibv_open_xrcd *cmd, size_t cmd_size, struct ibv_open_xrcd_resp *resp, size_t resp_size); int ibv_cmd_close_xrcd(struct verbs_xrcd *xrcd); #define IBV_CMD_REG_MR_HAS_RESP_PARAMS int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access, struct ibv_mr *mr, struct ibv_reg_mr *cmd, size_t cmd_size, struct ibv_reg_mr_resp *resp, size_t resp_size); int ibv_cmd_rereg_mr(struct ibv_mr *mr, uint32_t flags, void *addr, size_t length, uint64_t hca_va, int access, struct ibv_pd *pd, struct ibv_rereg_mr *cmd, size_t cmd_sz, struct ibv_rereg_mr_resp *resp, size_t resp_sz); int ibv_cmd_dereg_mr(struct ibv_mr *mr); int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type, struct ibv_mw *mw, struct ibv_alloc_mw *cmd, size_t cmd_size, struct ibv_alloc_mw_resp *resp, size_t resp_size); int ibv_cmd_dealloc_mw(struct ibv_mw *mw, struct ibv_dealloc_mw *cmd, size_t cmd_size); int ibv_cmd_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector, struct ibv_cq *cq, struct ibv_create_cq *cmd, size_t cmd_size, struct ibv_create_cq_resp *resp, size_t resp_size); int ibv_cmd_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, struct ibv_cq_ex *cq, struct 
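For orientation, this is roughly how a provider plugs into the "new interface" above; every foo_* name is hypothetical, and real providers (libcxgb4 earlier in this patch, for instance) may still use the old alloc_context/free_context pair:

#include <infiniband/driver.h>
#include <stdlib.h>

struct foo_context {
	struct verbs_context ibv_ctx;	/* must come first; provider data follows */
};

static int foo_init_context(struct verbs_device *vdev,
			    struct ibv_context *ctx, int cmd_fd)
{
	ctx->cmd_fd = cmd_fd;
	/* issue GET_CONTEXT via ibv_cmd_get_context(), fill ctx->ops, ... */
	return 0;
}

static void foo_uninit_context(struct verbs_device *vdev,
			       struct ibv_context *ctx)
{
}

static const struct verbs_device_ops foo_dev_ops = {
	.init_context   = foo_init_context,
	.uninit_context = foo_uninit_context,
};

static struct verbs_device *foo_driver_init(const char *uverbs_sys_path,
					    int abi_version)
{
	struct verbs_device *dev;

	/* normally: read the sysfs device id first and return NULL if not ours */
	dev = calloc(1, sizeof(*dev));
	if (!dev)
		return NULL;
	dev->ops = &foo_dev_ops;
	dev->sz = sizeof(*dev);
	/* bytes the library allocates beyond struct verbs_context */
	dev->size_of_context = sizeof(struct foo_context) -
			       sizeof(struct verbs_context);
	return dev;
}

static void __attribute__((constructor)) foo_register(void)
{
	verbs_register_driver("foo", foo_driver_init);
}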
ibv_create_cq_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_cq_resp_ex *resp, size_t resp_core_size, size_t resp_size); int ibv_cmd_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); int ibv_cmd_req_notify_cq(struct ibv_cq *cq, int solicited_only); #define IBV_CMD_RESIZE_CQ_HAS_RESP_PARAMS int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, struct ibv_resize_cq *cmd, size_t cmd_size, struct ibv_resize_cq_resp *resp, size_t resp_size); int ibv_cmd_destroy_cq(struct ibv_cq *cq); int ibv_cmd_create_srq(struct ibv_pd *pd, struct ibv_srq *srq, struct ibv_srq_init_attr *attr, struct ibv_create_srq *cmd, size_t cmd_size, struct ibv_create_srq_resp *resp, size_t resp_size); int ibv_cmd_create_srq_ex(struct ibv_context *context, struct verbs_srq *srq, int vsrq_sz, struct ibv_srq_init_attr_ex *attr_ex, struct ibv_create_xsrq *cmd, size_t cmd_size, struct ibv_create_srq_resp *resp, size_t resp_size); int ibv_cmd_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask, struct ibv_modify_srq *cmd, size_t cmd_size); int ibv_cmd_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, struct ibv_query_srq *cmd, size_t cmd_size); int ibv_cmd_destroy_srq(struct ibv_srq *srq); int ibv_cmd_create_qp(struct ibv_pd *pd, struct ibv_qp *qp, struct ibv_qp_init_attr *attr, struct ibv_create_qp *cmd, size_t cmd_size, struct ibv_create_qp_resp *resp, size_t resp_size); int ibv_cmd_create_qp_ex(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_init_attr_ex *attr_ex, struct ibv_create_qp *cmd, size_t cmd_size, struct ibv_create_qp_resp *resp, size_t resp_size); int ibv_cmd_create_qp_ex2(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_init_attr_ex *qp_attr, struct ibv_create_qp_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_qp_resp_ex *resp, size_t resp_core_size, size_t resp_size); int ibv_cmd_open_qp(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_open_attr *attr, struct ibv_open_qp *cmd, size_t cmd_size, struct ibv_create_qp_resp *resp, size_t resp_size); int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *qp_attr, int attr_mask, struct ibv_qp_init_attr *qp_init_attr, struct ibv_query_qp *cmd, size_t cmd_size); int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_modify_qp *cmd, size_t cmd_size); int ibv_cmd_modify_qp_ex(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_modify_qp_ex *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_modify_qp_resp_ex *resp, size_t resp_core_size, size_t resp_size); int ibv_cmd_destroy_qp(struct ibv_qp *qp); int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr); int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); int ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, struct ibv_ah_attr *attr, struct ibv_create_ah_resp *resp, size_t resp_size); int ibv_cmd_destroy_ah(struct ibv_ah *ah); int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); struct ibv_flow *ibv_cmd_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow_attr); int ibv_cmd_destroy_flow(struct ibv_flow *flow_id); int ibv_cmd_create_wq(struct ibv_context *context, struct ibv_wq_init_attr 
*wq_init_attr, struct ibv_wq *wq, struct ibv_create_wq *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_wq_resp *resp, size_t resp_core_size, size_t resp_size); int ibv_cmd_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr, struct ibv_modify_wq *cmd, size_t cmd_core_size, size_t cmd_size); int ibv_cmd_destroy_wq(struct ibv_wq *wq); int ibv_cmd_create_rwq_ind_table(struct ibv_context *context, struct ibv_rwq_ind_table_init_attr *init_attr, struct ibv_rwq_ind_table *rwq_ind_table, struct ibv_create_rwq_ind_table *cmd, size_t cmd_core_size, size_t cmd_size, struct ibv_create_rwq_ind_table_resp *resp, size_t resp_core_size, size_t resp_size); int ibv_cmd_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table); int ibv_dontfork_range(void *base, size_t size); int ibv_dofork_range(void *base, size_t size); /* * sysfs helper functions */ const char *ibv_get_sysfs_path(void); int ibv_read_sysfs_file(const char *dir, const char *file, char *buf, size_t size); static inline int verbs_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) { struct verbs_srq *vsrq = container_of(srq, struct verbs_srq, srq); if (vsrq->comp_mask & VERBS_SRQ_NUM) { *srq_num = vsrq->srq_num; return 0; } return ENOSYS; } int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, unsigned int index, enum ibv_gid_type *type); #endif /* INFINIBAND_DRIVER_H */ diff --git a/contrib/ofed/libibverbs/libibverbs.map b/contrib/ofed/libibverbs/libibverbs.map index b49c09a06ce3..d94743389f47 100644 --- a/contrib/ofed/libibverbs/libibverbs.map +++ b/contrib/ofed/libibverbs/libibverbs.map @@ -1,139 +1,142 @@ /* Do not change this file without reading Documentation/versioning.md */ IBVERBS_1.0 { global: ibv_get_device_list; ibv_free_device_list; ibv_get_device_name; ibv_get_device_guid; ibv_open_device; ibv_close_device; ibv_get_async_event; ibv_ack_async_event; ibv_query_device; ibv_query_port; ibv_query_gid; ibv_query_pkey; ibv_alloc_pd; ibv_dealloc_pd; ibv_reg_mr; ibv_dereg_mr; ibv_create_comp_channel; ibv_destroy_comp_channel; ibv_create_cq; ibv_resize_cq; ibv_destroy_cq; ibv_get_cq_event; ibv_ack_cq_events; ibv_create_srq; ibv_modify_srq; ibv_query_srq; ibv_destroy_srq; ibv_create_qp; ibv_query_qp; ibv_modify_qp; ibv_destroy_qp; ibv_create_ah; ibv_destroy_ah; ibv_attach_mcast; ibv_detach_mcast; ibv_rate_to_mult; mult_to_ibv_rate; /* These historical symbols are now private to libibverbs, but used by other rdma-core libraries. Do not change them. */ ibv_copy_path_rec_from_kern; ibv_copy_path_rec_to_kern; ibv_copy_qp_attr_from_kern; ibv_get_sysfs_path; ibv_read_sysfs_file; local: *; }; IBVERBS_1.1 { global: ibv_get_device_list; ibv_free_device_list; ibv_get_device_name; ibv_get_device_guid; ibv_open_device; ibv_close_device; ibv_init_ah_from_wc; ibv_create_ah_from_wc; ibv_fork_init; ibv_dontfork_range; ibv_dofork_range; ibv_node_type_str; ibv_port_state_str; ibv_event_type_str; ibv_wc_status_str; ibv_rate_to_mbps; mbps_to_ibv_rate; ibv_resolve_eth_l2_from_gid; /* These historical symbols are now private to libibverbs, but used by other rdma-core libraries. Do not change them. */ ibv_copy_ah_attr_from_kern; } IBVERBS_1.0; /* NOTE: The next stanza for public symbols should be IBVERBS_1.4 due to release 12 */ /* If any symbols in this stanza change ABI then the entire staza gets a new symbol version. 
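A quick note on verbs_get_srq_num() just above: it only succeeds when the provider recorded the number at create time (VERBS_SRQ_NUM set in comp_mask), which is what an XRC peer needs in order to address the SRQ remotely. Illustration only:

#include <infiniband/driver.h>
#include <stdio.h>

static void show_srqn(struct ibv_srq *srq)
{
	uint32_t srqn;

	if (verbs_get_srq_num(srq, &srqn))	/* ENOSYS unless VERBS_SRQ_NUM was set */
		fprintf(stderr, "provider did not report an SRQ number\n");
	else
		printf("srqn = 0x%x\n", srqn);
}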
Also see the private_symver() macro */ IBVERBS_PRIVATE_14 { global: /* These historical symbols are now private to libibverbs */ ibv_cmd_alloc_mw; ibv_cmd_alloc_pd; ibv_cmd_attach_mcast; ibv_cmd_close_xrcd; ibv_cmd_create_ah; ibv_cmd_create_cq; ibv_cmd_create_cq_ex; ibv_cmd_create_flow; ibv_cmd_create_qp; ibv_cmd_create_qp_ex2; ibv_cmd_create_qp_ex; ibv_cmd_create_rwq_ind_table; ibv_cmd_create_srq; ibv_cmd_create_srq_ex; ibv_cmd_create_wq; ibv_cmd_dealloc_mw; ibv_cmd_dealloc_pd; ibv_cmd_dereg_mr; ibv_cmd_destroy_ah; ibv_cmd_destroy_cq; ibv_cmd_destroy_flow; ibv_cmd_destroy_qp; ibv_cmd_destroy_rwq_ind_table; ibv_cmd_destroy_srq; ibv_cmd_destroy_wq; ibv_cmd_detach_mcast; ibv_cmd_get_context; ibv_cmd_modify_qp; ibv_cmd_modify_qp_ex; ibv_cmd_modify_srq; ibv_cmd_modify_wq; ibv_cmd_open_qp; ibv_cmd_open_xrcd; ibv_cmd_poll_cq; ibv_cmd_post_recv; ibv_cmd_post_send; ibv_cmd_post_srq_recv; ibv_cmd_query_device; ibv_cmd_query_device_ex; ibv_cmd_query_port; ibv_cmd_query_qp; ibv_cmd_query_srq; ibv_cmd_reg_mr; ibv_cmd_req_notify_cq; ibv_cmd_rereg_mr; ibv_cmd_resize_cq; ibv_query_gid_type; verbs_register_driver; verbs_init_cq; + verbs_cleanup_cq; + ibv_init_wq; + ibv_cleanup_wq; }; diff --git a/contrib/ofed/libibverbs/verbs.c b/contrib/ofed/libibverbs/verbs.c index aec8706fd0cc..5c23406e69e7 100644 --- a/contrib/ofed/libibverbs/verbs.c +++ b/contrib/ofed/libibverbs/verbs.c @@ -1,1037 +1,1059 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include "ibverbs.h" #ifndef NRESOLVE_NEIGH #include #include #include "neigh.h" #endif /* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse with these prototypes. Symbol versionining requires the goofy names, the prototype must match the version in verbs.h. 
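The __ibv_* "goofy names" and the map stanzas above meet in the default_symver()/private_symver() macros from ibverbs.h. As a sketch of the mechanism only (assuming GNU-as .symver support; the exact macro text in ibverbs.h may differ), the internal symbol is bound to the exported, versioned name, which is why a newly exported helper such as verbs_cleanup_cq also needs the map-file entry this patch adds:

/* sketch, not the literal ibverbs.h definitions */
#define default_symver(internal, api) \
	asm(".symver " #internal "," #api "@@IBVERBS_1.1")
#define private_symver(internal, api) \
	asm(".symver " #internal "," #api "@@IBVERBS_PRIVATE_14")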
*/ int __ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); int __ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index, __be16 *pkey); struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context); int __ibv_dealloc_pd(struct ibv_pd *pd); struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); int __ibv_dereg_mr(struct ibv_mr *mr); struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); int __ibv_resize_cq(struct ibv_cq *cq, int cqe); int __ibv_destroy_cq(struct ibv_cq *cq); int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq, void **cq_context); void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents); struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr); int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask); int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); int __ibv_destroy_srq(struct ibv_srq *srq); struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int __ibv_destroy_qp(struct ibv_qp *qp); struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); int __ibv_destroy_ah(struct ibv_ah *ah); int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate) { switch (rate) { case IBV_RATE_2_5_GBPS: return 1; case IBV_RATE_5_GBPS: return 2; case IBV_RATE_10_GBPS: return 4; case IBV_RATE_20_GBPS: return 8; case IBV_RATE_30_GBPS: return 12; case IBV_RATE_40_GBPS: return 16; case IBV_RATE_60_GBPS: return 24; case IBV_RATE_80_GBPS: return 32; case IBV_RATE_120_GBPS: return 48; case IBV_RATE_28_GBPS: return 11; case IBV_RATE_50_GBPS: return 20; case IBV_RATE_400_GBPS: return 160; case IBV_RATE_600_GBPS: return 240; default: return -1; } } enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult) { switch (mult) { case 1: return IBV_RATE_2_5_GBPS; case 2: return IBV_RATE_5_GBPS; case 4: return IBV_RATE_10_GBPS; case 8: return IBV_RATE_20_GBPS; case 12: return IBV_RATE_30_GBPS; case 16: return IBV_RATE_40_GBPS; case 24: return IBV_RATE_60_GBPS; case 32: return IBV_RATE_80_GBPS; case 48: return IBV_RATE_120_GBPS; case 11: return IBV_RATE_28_GBPS; case 20: return IBV_RATE_50_GBPS; case 160: return IBV_RATE_400_GBPS; case 240: return IBV_RATE_600_GBPS; default: return IBV_RATE_MAX; } } int __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate) { switch (rate) { case IBV_RATE_2_5_GBPS: return 2500; case IBV_RATE_5_GBPS: return 5000; case IBV_RATE_10_GBPS: return 10000; case IBV_RATE_20_GBPS: return 20000; case IBV_RATE_30_GBPS: return 30000; case IBV_RATE_40_GBPS: return 40000; case IBV_RATE_60_GBPS: return 60000; case IBV_RATE_80_GBPS: return 80000; case IBV_RATE_120_GBPS: return 120000; 
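The conversion helpers above (and the Mb/s table that continues below) are pure lookup tables keyed on enum ibv_rate, using the actual signalling rates, which is why, for example, 100 Gb/s reports 103125 Mb/s (4 x 25781.25). A tiny round-trip check:

#include <infiniband/verbs.h>
#include <stdio.h>

int main(void)
{
	int mult = ibv_rate_to_mult(IBV_RATE_40_GBPS);	/* 16, i.e. 16 x 2.5 Gb/s */
	enum ibv_rate back = mult_to_ibv_rate(mult);	/* IBV_RATE_40_GBPS again */

	printf("mult=%d mbps=%d roundtrip=%d\n",
	       mult, ibv_rate_to_mbps(back), back == IBV_RATE_40_GBPS);
	return 0;
}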
case IBV_RATE_14_GBPS: return 14062; case IBV_RATE_56_GBPS: return 56250; case IBV_RATE_112_GBPS: return 112500; case IBV_RATE_168_GBPS: return 168750; case IBV_RATE_25_GBPS: return 25781; case IBV_RATE_100_GBPS: return 103125; case IBV_RATE_200_GBPS: return 206250; case IBV_RATE_300_GBPS: return 309375; case IBV_RATE_28_GBPS: return 28125; case IBV_RATE_50_GBPS: return 53125; case IBV_RATE_400_GBPS: return 425000; case IBV_RATE_600_GBPS: return 637500; default: return -1; } } enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps) { switch (mbps) { case 2500: return IBV_RATE_2_5_GBPS; case 5000: return IBV_RATE_5_GBPS; case 10000: return IBV_RATE_10_GBPS; case 20000: return IBV_RATE_20_GBPS; case 30000: return IBV_RATE_30_GBPS; case 40000: return IBV_RATE_40_GBPS; case 60000: return IBV_RATE_60_GBPS; case 80000: return IBV_RATE_80_GBPS; case 120000: return IBV_RATE_120_GBPS; case 14062: return IBV_RATE_14_GBPS; case 56250: return IBV_RATE_56_GBPS; case 112500: return IBV_RATE_112_GBPS; case 168750: return IBV_RATE_168_GBPS; case 25781: return IBV_RATE_25_GBPS; case 103125: return IBV_RATE_100_GBPS; case 206250: return IBV_RATE_200_GBPS; case 309375: return IBV_RATE_300_GBPS; case 28125: return IBV_RATE_28_GBPS; case 53125: return IBV_RATE_50_GBPS; case 425000: return IBV_RATE_400_GBPS; case 637500: return IBV_RATE_600_GBPS; default: return IBV_RATE_MAX; } } int __ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { return context->ops.query_device(context, device_attr); } default_symver(__ibv_query_device, ibv_query_device); int __ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { return context->ops.query_port(context, port_num, port_attr); } default_symver(__ibv_query_port, ibv_query_port); int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { char name[24]; char attr[41]; uint16_t val; int i; snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index); if (ibv_read_sysfs_file(context->device->ibdev_path, name, attr, sizeof attr) < 0) return -1; for (i = 0; i < 8; ++i) { if (sscanf(attr + i * 5, "%hx", &val) != 1) return -1; gid->raw[i * 2 ] = val >> 8; gid->raw[i * 2 + 1] = val & 0xff; } return 0; } default_symver(__ibv_query_gid, ibv_query_gid); int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index, __be16 *pkey) { char name[24]; char attr[8]; uint16_t val; snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index); if (ibv_read_sysfs_file(context->device->ibdev_path, name, attr, sizeof attr) < 0) return -1; if (sscanf(attr, "%hx", &val) != 1) return -1; *pkey = htobe16(val); return 0; } default_symver(__ibv_query_pkey, ibv_query_pkey); struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context) { struct ibv_pd *pd; pd = context->ops.alloc_pd(context); if (pd) pd->context = context; return pd; } default_symver(__ibv_alloc_pd, ibv_alloc_pd); int __ibv_dealloc_pd(struct ibv_pd *pd) { return pd->context->ops.dealloc_pd(pd); } default_symver(__ibv_dealloc_pd, ibv_dealloc_pd); struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { struct ibv_mr *mr; if (ibv_dontfork_range(addr, length)) return NULL; mr = pd->context->ops.reg_mr(pd, addr, length, access); if (mr) { mr->context = pd->context; mr->pd = pd; mr->addr = addr; mr->length = length; } else ibv_dofork_range(addr, length); return mr; } default_symver(__ibv_reg_mr, ibv_reg_mr); int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct 
ibv_pd *pd, void *addr, size_t length, int access) { int dofork_onfail = 0; int err; void *old_addr; size_t old_len; if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) { errno = EINVAL; return IBV_REREG_MR_ERR_INPUT; } if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) && (!length || !addr)) { errno = EINVAL; return IBV_REREG_MR_ERR_INPUT; } if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) { errno = EINVAL; return IBV_REREG_MR_ERR_INPUT; } if (!mr->context->ops.rereg_mr) { errno = ENOSYS; return IBV_REREG_MR_ERR_INPUT; } if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { err = ibv_dontfork_range(addr, length); if (err) return IBV_REREG_MR_ERR_DONT_FORK_NEW; dofork_onfail = 1; } old_addr = mr->addr; old_len = mr->length; err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access); if (!err) { if (flags & IBV_REREG_MR_CHANGE_PD) mr->pd = pd; if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { mr->addr = addr; mr->length = length; err = ibv_dofork_range(old_addr, old_len); if (err) return IBV_REREG_MR_ERR_DO_FORK_OLD; } } else { err = IBV_REREG_MR_ERR_CMD; if (dofork_onfail) { if (ibv_dofork_range(addr, length)) err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW; } } return err; } default_symver(__ibv_rereg_mr, ibv_rereg_mr); int __ibv_dereg_mr(struct ibv_mr *mr) { int ret; void *addr = mr->addr; size_t length = mr->length; ret = mr->context->ops.dereg_mr(mr); if (!ret) ibv_dofork_range(addr, length); return ret; } default_symver(__ibv_dereg_mr, ibv_dereg_mr); static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context) { struct ibv_abi_compat_v2 *t = context->abi_compat; static int warned; if (!pthread_mutex_trylock(&t->in_use)) return &t->channel; if (!warned) { fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n" " Only one completion channel can be created per context.\n", abi_ver); ++warned; } return NULL; } struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context) { struct ibv_comp_channel *channel; struct ibv_create_comp_channel cmd; struct ibv_create_comp_channel_resp resp; if (abi_ver <= 2) return ibv_create_comp_channel_v2(context); channel = malloc(sizeof *channel); if (!channel) return NULL; IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp); if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) { free(channel); return NULL; } (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); channel->context = context; channel->fd = resp.fd; channel->refcnt = 0; return channel; } static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel) { struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel; pthread_mutex_unlock(&t->in_use); return 0; } int ibv_destroy_comp_channel(struct ibv_comp_channel *channel) { struct ibv_context *context; int ret; context = channel->context; pthread_mutex_lock(&context->mutex); if (channel->refcnt) { ret = EBUSY; goto out; } if (abi_ver <= 2) { ret = ibv_destroy_comp_channel_v2(channel); goto out; } close(channel->fd); free(channel); ret = 0; out: pthread_mutex_unlock(&context->mutex); return ret; } struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) { struct ibv_cq *cq; + int err = 0; cq = context->ops.create_cq(context, cqe, channel, comp_vector); - if (cq) - verbs_init_cq(cq, context, channel, cq_context); + if (!cq) + return NULL; + + err = verbs_init_cq(cq, context, channel, cq_context); + if (err) + goto err; return cq; + +err: + context->ops.destroy_cq(cq); + + return 
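Callers of the rereg path above get back either 0 or one of the IBV_REREG_MR_ERR_* codes, with errno carrying the detail. A minimal sketch that only moves an MR to a new buffer, keeping its PD and access rights (assumes the provider implements rereg_mr; otherwise IBV_REREG_MR_ERR_INPUT/ENOSYS comes back):

#include <infiniband/verbs.h>

static int move_mr(struct ibv_mr *mr, void *new_buf, size_t new_len)
{
	int ret;

	ret = ibv_rereg_mr(mr, IBV_REREG_MR_CHANGE_TRANSLATION,
			   NULL /* PD unchanged */, new_buf, new_len, 0);
	if (ret)	/* an IBV_REREG_MR_ERR_* code; errno has the cause */
		return ret;
	/* on success mr->addr and mr->length already describe new_buf */
	return 0;
}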
NULL; } default_symver(__ibv_create_cq, ibv_create_cq); int __ibv_resize_cq(struct ibv_cq *cq, int cqe) { if (!cq->context->ops.resize_cq) return ENOSYS; return cq->context->ops.resize_cq(cq, cqe); } default_symver(__ibv_resize_cq, ibv_resize_cq); int __ibv_destroy_cq(struct ibv_cq *cq) { struct ibv_comp_channel *channel = cq->channel; int ret; ret = cq->context->ops.destroy_cq(cq); if (channel) { if (!ret) { pthread_mutex_lock(&channel->context->mutex); --channel->refcnt; pthread_mutex_unlock(&channel->context->mutex); } } return ret; } default_symver(__ibv_destroy_cq, ibv_destroy_cq); int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq, void **cq_context) { struct ibv_comp_event ev; if (read(channel->fd, &ev, sizeof ev) != sizeof ev) return -1; *cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle; *cq_context = (*cq)->cq_context; if ((*cq)->context->ops.cq_event) (*cq)->context->ops.cq_event(*cq); return 0; } default_symver(__ibv_get_cq_event, ibv_get_cq_event); void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents) { pthread_mutex_lock(&cq->mutex); cq->comp_events_completed += nevents; pthread_cond_signal(&cq->cond); pthread_mutex_unlock(&cq->mutex); } default_symver(__ibv_ack_cq_events, ibv_ack_cq_events); struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr) { struct ibv_srq *srq; if (!pd->context->ops.create_srq) return NULL; srq = pd->context->ops.create_srq(pd, srq_init_attr); - if (srq) { - srq->context = pd->context; - srq->srq_context = srq_init_attr->srq_context; - srq->pd = pd; - srq->events_completed = 0; - pthread_mutex_init(&srq->mutex, NULL); - pthread_cond_init(&srq->cond, NULL); - } + if (!srq) + return NULL; + + srq->context = pd->context; + srq->srq_context = srq_init_attr->srq_context; + srq->pd = pd; + srq->events_completed = 0; + if (pthread_mutex_init(&srq->mutex, NULL)) + goto err; + if (pthread_cond_init(&srq->cond, NULL)) + goto err_mutex; return srq; + +err_mutex: + pthread_mutex_destroy(&srq->mutex); +err: + pd->context->ops.destroy_srq(srq); + + return NULL; } default_symver(__ibv_create_srq, ibv_create_srq); int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask) { return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask); } default_symver(__ibv_modify_srq, ibv_modify_srq); int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) { return srq->context->ops.query_srq(srq, srq_attr); } default_symver(__ibv_query_srq, ibv_query_srq); int __ibv_destroy_srq(struct ibv_srq *srq) { + pthread_cond_destroy(&srq->cond); + pthread_mutex_destroy(&srq->mutex); return srq->context->ops.destroy_srq(srq); } default_symver(__ibv_destroy_srq, ibv_destroy_srq); struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr); if (qp) { qp->context = pd->context; qp->qp_context = qp_init_attr->qp_context; qp->pd = pd; qp->send_cq = qp_init_attr->send_cq; qp->recv_cq = qp_init_attr->recv_cq; qp->srq = qp_init_attr->srq; qp->qp_type = qp_init_attr->qp_type; qp->state = IBV_QPS_RESET; qp->events_completed = 0; pthread_mutex_init(&qp->mutex, NULL); pthread_cond_init(&qp->cond, NULL); } return qp; } default_symver(__ibv_create_qp, ibv_create_qp); int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { int ret; ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr); if (ret) return ret; if 
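Putting the completion-channel pieces above together: the usual application loop arms the CQ, sleeps on the channel fd, acknowledges the event (so a later destroy, which waits on comp_events_completed, can make progress), re-arms, then polls. Minimal sketch, error handling trimmed:

#include <infiniband/verbs.h>

static int wait_one_completion(struct ibv_context *ctx)
{
	struct ibv_comp_channel *ch = ibv_create_comp_channel(ctx);
	struct ibv_cq *cq, *ev_cq;
	struct ibv_wc wc;
	void *ev_ctx;

	if (!ch)
		return -1;
	cq = ibv_create_cq(ctx, 16, NULL, ch, 0);
	if (!cq)
		return -1;

	ibv_req_notify_cq(cq, 0);		/* arm before sleeping */
	if (ibv_get_cq_event(ch, &ev_cq, &ev_ctx))
		return -1;
	ibv_ack_cq_events(ev_cq, 1);		/* lets ibv_destroy_cq() finish later */
	ibv_req_notify_cq(ev_cq, 0);		/* re-arm before draining */
	while (ibv_poll_cq(ev_cq, 1, &wc) > 0)
		;				/* handle wc here */

	ibv_destroy_cq(cq);
	ibv_destroy_comp_channel(ch);
	return 0;
}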
(attr_mask & IBV_QP_STATE) qp->state = attr->qp_state; return 0; } default_symver(__ibv_query_qp, ibv_query_qp); int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { int ret; ret = qp->context->ops.modify_qp(qp, attr, attr_mask); if (ret) return ret; if (attr_mask & IBV_QP_STATE) qp->state = attr->qp_state; return 0; } default_symver(__ibv_modify_qp, ibv_modify_qp); int __ibv_destroy_qp(struct ibv_qp *qp) { return qp->context->ops.destroy_qp(qp); } default_symver(__ibv_destroy_qp, ibv_destroy_qp); struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr); if (ah) { ah->context = pd->context; ah->pd = pd; } return ah; } default_symver(__ibv_create_ah, ibv_create_ah); /* GID types as appear in sysfs, no change is expected as of ABI * compatibility. */ #define V1_TYPE "IB/RoCE v1" #define V2_TYPE "RoCE v2" int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, unsigned int index, enum ibv_gid_type *type) { char name[32]; char buff[11]; snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, index); /* Reset errno so that we can rely on its value upon any error flow in * ibv_read_sysfs_file. */ errno = 0; if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff, sizeof(buff)) <= 0) { char *dir_path; DIR *dir; if (errno == EINVAL) { /* In IB, this file doesn't exist and the kernel sets * errno to -EINVAL. */ *type = IBV_GID_TYPE_IB_ROCE_V1; return 0; } if (asprintf(&dir_path, "%s/%s/%d/%s/", context->device->ibdev_path, "ports", port_num, "gid_attrs") < 0) return -1; dir = opendir(dir_path); free(dir_path); if (!dir) { if (errno == ENOENT) /* Assuming that if gid_attrs doesn't exist, * we have an old kernel and all GIDs are * IB/RoCE v1 */ *type = IBV_GID_TYPE_IB_ROCE_V1; else return -1; } else { closedir(dir); errno = EFAULT; return -1; } } else { if (!strcmp(buff, V1_TYPE)) { *type = IBV_GID_TYPE_IB_ROCE_V1; } else if (!strcmp(buff, V2_TYPE)) { *type = IBV_GID_TYPE_ROCE_V2; } else { errno = ENOTSUP; return -1; } } return 0; } static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, union ibv_gid *gid, enum ibv_gid_type gid_type) { enum ibv_gid_type sgid_type = 0; union ibv_gid sgid; int i = 0, ret; do { ret = ibv_query_gid(context, port_num, i, &sgid); if (!ret) { ret = ibv_query_gid_type(context, port_num, i, &sgid_type); } i++; } while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) || (gid_type != sgid_type))); return ret ? ret : i - 1; } static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) { ipv6->s6_addr32[0] = 0; ipv6->s6_addr32[1] = 0; ipv6->s6_addr32[2] = htobe32(0x0000FFFF); ipv6->s6_addr32[3] = ipv4; } static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords) { unsigned int i = 0; uint32_t sum = 0; for (i = 0; i < num_hwords; i++) sum += *(data++); sum = (sum & 0xffff) + (sum >> 16); return (__sum16)~sum; } static inline int get_grh_header_version(struct ibv_grh *grh) { int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf; struct ip *ip4h = (struct ip *)((void *)grh + 20); struct ip ip4h_checked; if (ip6h_version != 6) { if (ip4h->ip_v == 4) return 4; errno = EPROTONOSUPPORT; return -1; } /* version may be 6 or 4 */ if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */ return 6; /* * Verify checksum. * We can't write on scattered buffers so we have to copy to temp * buffer. 
*/ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); /* Need to set the checksum field (check) to 0 before re-calculating * the checksum. */ ip4h_checked.ip_sum = 0; ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10); /* if IPv4 header checksum is OK, believe it */ if (ip4h->ip_sum == ip4h_checked.ip_sum) return 4; return 6; } static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr, struct ibv_wc *wc, struct ibv_grh *grh, uint8_t port_num) { uint32_t flow_class; flow_class = be32toh(grh->version_tclass_flow); ah_attr->grh.flow_label = flow_class & 0xFFFFF; ah_attr->dlid = wc->slid; ah_attr->sl = wc->sl; ah_attr->src_path_bits = wc->dlid_path_bits; ah_attr->port_num = port_num; } static inline int set_ah_attr_by_ipv4(struct ibv_context *context, struct ibv_ah_attr *ah_attr, struct ip *ip4h, uint8_t port_num) { union ibv_gid sgid; int ret; /* No point searching multicast GIDs in GID table */ if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) { errno = EINVAL; return -1; } map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid); ret = ibv_find_gid_index(context, port_num, &sgid, IBV_GID_TYPE_ROCE_V2); if (ret < 0) return ret; map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr, (struct in6_addr *)&ah_attr->grh.dgid); ah_attr->grh.sgid_index = (uint8_t) ret; ah_attr->grh.hop_limit = ip4h->ip_ttl; ah_attr->grh.traffic_class = ip4h->ip_tos; return 0; } #define IB_NEXT_HDR 0x1b static inline int set_ah_attr_by_ipv6(struct ibv_context *context, struct ibv_ah_attr *ah_attr, struct ibv_grh *grh, uint8_t port_num) { uint32_t flow_class; uint32_t sgid_type; int ret; /* No point searching multicast GIDs in GID table */ if (grh->dgid.raw[0] == 0xFF) { errno = EINVAL; return -1; } ah_attr->grh.dgid = grh->sgid; if (grh->next_hdr == IPPROTO_UDP) { sgid_type = IBV_GID_TYPE_ROCE_V2; } else if (grh->next_hdr == IB_NEXT_HDR) { sgid_type = IBV_GID_TYPE_IB_ROCE_V1; } else { errno = EPROTONOSUPPORT; return -1; } ret = ibv_find_gid_index(context, port_num, &grh->dgid, sgid_type); if (ret < 0) return ret; ah_attr->grh.sgid_index = (uint8_t) ret; flow_class = be32toh(grh->version_tclass_flow); ah_attr->grh.hop_limit = grh->hop_limit; ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; return 0; } int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, struct ibv_wc *wc, struct ibv_grh *grh, struct ibv_ah_attr *ah_attr) { int version; int ret = 0; memset(ah_attr, 0, sizeof *ah_attr); set_ah_attr_generic_fields(ah_attr, wc, grh, port_num); if (wc->wc_flags & IBV_WC_GRH) { ah_attr->is_global = 1; version = get_grh_header_version(grh); if (version == 4) ret = set_ah_attr_by_ipv4(context, ah_attr, (struct ip *)((void *)grh + 20), port_num); else if (version == 6) ret = set_ah_attr_by_ipv6(context, ah_attr, grh, port_num); else ret = -1; } return ret; } struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, struct ibv_grh *grh, uint8_t port_num) { struct ibv_ah_attr ah_attr; int ret; ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr); if (ret) return NULL; return ibv_create_ah(pd, &ah_attr); } int __ibv_destroy_ah(struct ibv_ah *ah) { return ah->context->ops.destroy_ah(ah); } default_symver(__ibv_destroy_ah, ibv_destroy_ah); int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) { return qp->context->ops.attach_mcast(qp, gid, lid); } default_symver(__ibv_attach_mcast, ibv_attach_mcast); int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) { return 
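ibv_create_ah_from_wc() above is the usual way a UD responder answers the sender of a datagram it just received; when IBV_WC_GRH is set, the GRH (or the RoCE v2 IP header that the code above parses in its place) occupies the first 40 bytes of the receive buffer, and that is the buffer passed in here. Sketch:

#include <infiniband/verbs.h>

static struct ibv_ah *ah_for_sender(struct ibv_pd *pd, struct ibv_wc *wc,
				    void *recv_buf, uint8_t port_num)
{
	/* only examined when wc->wc_flags has IBV_WC_GRH set */
	struct ibv_grh *grh = recv_buf;

	return ibv_create_ah_from_wc(pd, wc, grh, port_num);
}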
qp->context->ops.detach_mcast(qp, gid, lid); } default_symver(__ibv_detach_mcast, ibv_detach_mcast); static inline int ipv6_addr_v4mapped(const struct in6_addr *a) { return IN6_IS_ADDR_V4MAPPED(a) || /* IPv4 encoded multicast addresses */ (a->s6_addr32[0] == htobe32(0xff0e0000) && ((a->s6_addr32[1] | (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL)); } struct peer_address { void *address; uint32_t size; }; static inline int create_peer_from_gid(int family, void *raw_gid, struct peer_address *peer_address) { switch (family) { case AF_INET: peer_address->address = raw_gid + 12; peer_address->size = 4; break; case AF_INET6: peer_address->address = raw_gid; peer_address->size = 16; break; default: return -1; } return 0; } #define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000 int ibv_resolve_eth_l2_from_gid(struct ibv_context *context, struct ibv_ah_attr *attr, uint8_t eth_mac[ETHERNET_LL_SIZE], uint16_t *vid) { #ifndef NRESOLVE_NEIGH int dst_family; int src_family; int oif; struct get_neigh_handler neigh_handler; union ibv_gid sgid; int ether_len; struct peer_address src; struct peer_address dst; uint16_t ret_vid; int ret = -EINVAL; int err; err = ibv_query_gid(context, attr->port_num, attr->grh.sgid_index, &sgid); if (err) return err; err = neigh_init_resources(&neigh_handler, NEIGH_GET_DEFAULT_TIMEOUT_MS); if (err) return err; dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ? AF_INET : AF_INET6; src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ? AF_INET : AF_INET6; if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst)) goto free_resources; if (create_peer_from_gid(src_family, &sgid.raw, &src)) goto free_resources; if (neigh_set_dst(&neigh_handler, dst_family, dst.address, dst.size)) goto free_resources; if (neigh_set_src(&neigh_handler, src_family, src.address, src.size)) goto free_resources; oif = neigh_get_oif_from_src(&neigh_handler); if (oif > 0) neigh_set_oif(&neigh_handler, oif); else goto free_resources; ret = -EHOSTUNREACH; /* blocking call */ if (process_get_neigh(&neigh_handler)) goto free_resources; ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler); if (ret_vid <= 0xfff) neigh_set_vlan_id(&neigh_handler, ret_vid); /* We are using only Ethernet here */ ether_len = neigh_get_ll(&neigh_handler, eth_mac, sizeof(uint8_t) * ETHERNET_LL_SIZE); if (ether_len <= 0) goto free_resources; *vid = ret_vid; ret = 0; free_resources: neigh_free_resources(&neigh_handler); return ret; #else return -ENOSYS; #endif } diff --git a/contrib/ofed/libibverbs/verbs.h b/contrib/ofed/libibverbs/verbs.h index 498275561280..bea817e36fc9 100644 --- a/contrib/ofed/libibverbs/verbs.h +++ b/contrib/ofed/libibverbs/verbs.h @@ -1,2380 +1,2377 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
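ibv_resolve_eth_l2_from_gid() above turns the AH's destination GID into a MAC and VLAN via the neighbour cache; it returns non-zero on failure (including -ENOSYS when the library is built with NRESOLVE_NEIGH). Usage sketch:

#include <infiniband/verbs.h>
#include <stdio.h>

static int print_dmac(struct ibv_context *ctx, struct ibv_ah_attr *attr)
{
	uint8_t mac[ETHERNET_LL_SIZE];
	uint16_t vid = 0xffff;
	int i;

	if (ibv_resolve_eth_l2_from_gid(ctx, attr, mac, &vid))
		return -1;
	for (i = 0; i < ETHERNET_LL_SIZE; i++)
		printf("%02x%c", mac[i],
		       i + 1 < ETHERNET_LL_SIZE ? ':' : '\n');
	return 0;
}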
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef INFINIBAND_VERBS_H #define INFINIBAND_VERBS_H #include #include #include #include #include #include #ifdef __cplusplus # define BEGIN_C_DECLS extern "C" { # define END_C_DECLS } #else /* !__cplusplus */ # define BEGIN_C_DECLS # define END_C_DECLS #endif /* __cplusplus */ #if __GNUC__ >= 3 # define __attribute_const __attribute__((const)) #else # define __attribute_const #endif BEGIN_C_DECLS union ibv_gid { uint8_t raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; #ifndef container_of /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. * @type: the type of the container struct this is embedded in. * @member: the name of the member within the struct. * */ #define container_of(ptr, type, member) \ ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) #endif #define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *) NULL) - 1; enum ibv_node_type { IBV_NODE_UNKNOWN = -1, IBV_NODE_CA = 1, IBV_NODE_SWITCH, IBV_NODE_ROUTER, IBV_NODE_RNIC, IBV_NODE_USNIC, IBV_NODE_USNIC_UDP, }; enum ibv_transport_type { IBV_TRANSPORT_UNKNOWN = -1, IBV_TRANSPORT_IB = 0, IBV_TRANSPORT_IWARP, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, }; enum ibv_device_cap_flags { IBV_DEVICE_RESIZE_MAX_WR = 1, IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, IBV_DEVICE_RAW_MULTI = 1 << 3, IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, IBV_DEVICE_INIT_TYPE = 1 << 9, IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, IBV_DEVICE_SRQ_RESIZE = 1 << 13, IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, IBV_DEVICE_MEM_WINDOW = 1 << 17, IBV_DEVICE_UD_IP_CSUM = 1 << 18, IBV_DEVICE_XRC = 1 << 20, IBV_DEVICE_MEM_MGT_EXTENSIONS = 1 << 21, IBV_DEVICE_MEM_WINDOW_TYPE_2A = 1 << 23, IBV_DEVICE_MEM_WINDOW_TYPE_2B = 1 << 24, IBV_DEVICE_RC_IP_CSUM = 1 << 25, IBV_DEVICE_RAW_IP_CSUM = 1 << 26, IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 }; /* * Can't extended above ibv_device_cap_flags enum as in some systems/compilers * enum range is limited to 4 bytes. 
*/ #define IBV_DEVICE_RAW_SCATTER_FCS (1ULL << 34) enum ibv_atomic_cap { IBV_ATOMIC_NONE, IBV_ATOMIC_HCA, IBV_ATOMIC_GLOB }; struct ibv_device_attr { char fw_ver[64]; __be64 node_guid; __be64 sys_image_guid; uint64_t max_mr_size; uint64_t page_size_cap; uint32_t vendor_id; uint32_t vendor_part_id; uint32_t hw_ver; int max_qp; int max_qp_wr; int device_cap_flags; int max_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ibv_atomic_cap atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; uint16_t max_pkeys; uint8_t local_ca_ack_delay; uint8_t phys_port_cnt; }; /* An extensible input struct for possible future extensions of the * ibv_query_device_ex verb. */ struct ibv_query_device_ex_input { uint32_t comp_mask; }; enum ibv_odp_transport_cap_bits { IBV_ODP_SUPPORT_SEND = 1 << 0, IBV_ODP_SUPPORT_RECV = 1 << 1, IBV_ODP_SUPPORT_WRITE = 1 << 2, IBV_ODP_SUPPORT_READ = 1 << 3, IBV_ODP_SUPPORT_ATOMIC = 1 << 4, }; struct ibv_odp_caps { uint64_t general_caps; struct { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; } per_transport_caps; }; enum ibv_odp_general_caps { IBV_ODP_SUPPORT = 1 << 0, }; struct ibv_tso_caps { uint32_t max_tso; uint32_t supported_qpts; }; /* RX Hash function flags */ enum ibv_rx_hash_function_flags { IBV_RX_HASH_FUNC_TOEPLITZ = 1 << 0, }; /* * RX Hash fields enable to set which incoming packet's field should * participates in RX Hash. Each flag represent certain packet's field, * when the flag is set the field that is represented by the flag will * participate in RX Hash calculation. * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP * and *TCP and *UDP flags can't be enabled together on the same QP. 
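Because the 32-bit device_cap_flags enum above cannot grow past bit 31, bits such as IBV_DEVICE_RAW_SCATTER_FCS are defined as 64-bit values and are reported through device_cap_flags_ex in struct ibv_device_attr_ex (defined just below), retrieved with ibv_query_device_ex(). A small capability probe, as a sketch:

#include <infiniband/verbs.h>
#include <string.h>

static int supports_raw_scatter_fcs(struct ibv_context *ctx)
{
	struct ibv_device_attr_ex attr;

	memset(&attr, 0, sizeof(attr));
	if (ibv_query_device_ex(ctx, NULL, &attr))
		return 0;
	return !!(attr.device_cap_flags_ex & IBV_DEVICE_RAW_SCATTER_FCS);
}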
*/ enum ibv_rx_hash_fields { IBV_RX_HASH_SRC_IPV4 = 1 << 0, IBV_RX_HASH_DST_IPV4 = 1 << 1, IBV_RX_HASH_SRC_IPV6 = 1 << 2, IBV_RX_HASH_DST_IPV6 = 1 << 3, IBV_RX_HASH_SRC_PORT_TCP = 1 << 4, IBV_RX_HASH_DST_PORT_TCP = 1 << 5, IBV_RX_HASH_SRC_PORT_UDP = 1 << 6, IBV_RX_HASH_DST_PORT_UDP = 1 << 7 }; struct ibv_rss_caps { uint32_t supported_qpts; uint32_t max_rwq_indirection_tables; uint32_t max_rwq_indirection_table_size; uint64_t rx_hash_fields_mask; /* enum ibv_rx_hash_fields */ uint8_t rx_hash_function; /* enum ibv_rx_hash_function_flags */ }; struct ibv_packet_pacing_caps { uint32_t qp_rate_limit_min; uint32_t qp_rate_limit_max; /* In kbps */ uint32_t supported_qpts; }; enum ibv_raw_packet_caps { IBV_RAW_PACKET_CAP_CVLAN_STRIPPING = 1 << 0, IBV_RAW_PACKET_CAP_SCATTER_FCS = 1 << 1, IBV_RAW_PACKET_CAP_IP_CSUM = 1 << 2, }; struct ibv_device_attr_ex { struct ibv_device_attr orig_attr; uint32_t comp_mask; struct ibv_odp_caps odp_caps; uint64_t completion_timestamp_mask; uint64_t hca_core_clock; uint64_t device_cap_flags_ex; struct ibv_tso_caps tso_caps; struct ibv_rss_caps rss_caps; uint32_t max_wq_type_rq; struct ibv_packet_pacing_caps packet_pacing_caps; uint32_t raw_packet_caps; /* Use ibv_raw_packet_caps */ }; enum ibv_mtu { IBV_MTU_256 = 1, IBV_MTU_512 = 2, IBV_MTU_1024 = 3, IBV_MTU_2048 = 4, IBV_MTU_4096 = 5 }; enum ibv_port_state { IBV_PORT_NOP = 0, IBV_PORT_DOWN = 1, IBV_PORT_INIT = 2, IBV_PORT_ARMED = 3, IBV_PORT_ACTIVE = 4, IBV_PORT_ACTIVE_DEFER = 5 }; enum { IBV_LINK_LAYER_UNSPECIFIED, IBV_LINK_LAYER_INFINIBAND, IBV_LINK_LAYER_ETHERNET, }; enum ibv_port_cap_flags { IBV_PORT_SM = 1 << 1, IBV_PORT_NOTICE_SUP = 1 << 2, IBV_PORT_TRAP_SUP = 1 << 3, IBV_PORT_OPT_IPD_SUP = 1 << 4, IBV_PORT_AUTO_MIGR_SUP = 1 << 5, IBV_PORT_SL_MAP_SUP = 1 << 6, IBV_PORT_MKEY_NVRAM = 1 << 7, IBV_PORT_PKEY_NVRAM = 1 << 8, IBV_PORT_LED_INFO_SUP = 1 << 9, IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IBV_PORT_CM_SUP = 1 << 16, IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, IBV_PORT_REINIT_SUP = 1 << 18, IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, IBV_PORT_VENDOR_CLASS_SUP = 1 << 20, IBV_PORT_DR_NOTICE_SUP = 1 << 21, IBV_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, IBV_PORT_BOOT_MGMT_SUP = 1 << 23, IBV_PORT_LINK_LATENCY_SUP = 1 << 24, IBV_PORT_CLIENT_REG_SUP = 1 << 25, IBV_PORT_IP_BASED_GIDS = 1 << 26 }; struct ibv_port_attr { enum ibv_port_state state; enum ibv_mtu max_mtu; enum ibv_mtu active_mtu; int gid_tbl_len; uint32_t port_cap_flags; uint32_t max_msg_sz; uint32_t bad_pkey_cntr; uint32_t qkey_viol_cntr; uint16_t pkey_tbl_len; uint16_t lid; uint16_t sm_lid; uint8_t lmc; uint8_t max_vl_num; uint8_t sm_sl; uint8_t subnet_timeout; uint8_t init_type_reply; uint8_t active_width; uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; uint8_t reserved; }; enum ibv_event_type { IBV_EVENT_CQ_ERR, IBV_EVENT_QP_FATAL, IBV_EVENT_QP_REQ_ERR, IBV_EVENT_QP_ACCESS_ERR, IBV_EVENT_COMM_EST, IBV_EVENT_SQ_DRAINED, IBV_EVENT_PATH_MIG, IBV_EVENT_PATH_MIG_ERR, IBV_EVENT_DEVICE_FATAL, IBV_EVENT_PORT_ACTIVE, IBV_EVENT_PORT_ERR, IBV_EVENT_LID_CHANGE, IBV_EVENT_PKEY_CHANGE, IBV_EVENT_SM_CHANGE, IBV_EVENT_SRQ_ERR, IBV_EVENT_SRQ_LIMIT_REACHED, IBV_EVENT_QP_LAST_WQE_REACHED, IBV_EVENT_CLIENT_REREGISTER, IBV_EVENT_GID_CHANGE, IBV_EVENT_WQ_FATAL, }; struct ibv_async_event { union { struct ibv_cq *cq; struct ibv_qp *qp; struct ibv_srq *srq; struct ibv_wq *wq; int port_num; } element; enum ibv_event_type event_type; }; enum ibv_wc_status { IBV_WC_SUCCESS, IBV_WC_LOC_LEN_ERR, 
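ibv_query_port() fills the struct ibv_port_attr above; note that max_mtu/active_mtu are the enum codes (IBV_MTU_*), not byte counts, and link_layer distinguishes InfiniBand from RoCE ports. A small sketch for port 1:

#include <infiniband/verbs.h>
#include <stdio.h>

static int show_port(struct ibv_context *ctx)
{
	struct ibv_port_attr pattr;

	if (ibv_query_port(ctx, 1, &pattr))
		return -1;
	printf("state=%s lid=0x%x active_mtu=%d link_layer=%s\n",
	       ibv_port_state_str(pattr.state), pattr.lid, pattr.active_mtu,
	       pattr.link_layer == IBV_LINK_LAYER_ETHERNET ? "ethernet"
							   : "infiniband");
	return 0;
}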
IBV_WC_LOC_QP_OP_ERR, IBV_WC_LOC_EEC_OP_ERR, IBV_WC_LOC_PROT_ERR, IBV_WC_WR_FLUSH_ERR, IBV_WC_MW_BIND_ERR, IBV_WC_BAD_RESP_ERR, IBV_WC_LOC_ACCESS_ERR, IBV_WC_REM_INV_REQ_ERR, IBV_WC_REM_ACCESS_ERR, IBV_WC_REM_OP_ERR, IBV_WC_RETRY_EXC_ERR, IBV_WC_RNR_RETRY_EXC_ERR, IBV_WC_LOC_RDD_VIOL_ERR, IBV_WC_REM_INV_RD_REQ_ERR, IBV_WC_REM_ABORT_ERR, IBV_WC_INV_EECN_ERR, IBV_WC_INV_EEC_STATE_ERR, IBV_WC_FATAL_ERR, IBV_WC_RESP_TIMEOUT_ERR, IBV_WC_GENERAL_ERR }; const char *ibv_wc_status_str(enum ibv_wc_status status); enum ibv_wc_opcode { IBV_WC_SEND, IBV_WC_RDMA_WRITE, IBV_WC_RDMA_READ, IBV_WC_COMP_SWAP, IBV_WC_FETCH_ADD, IBV_WC_BIND_MW, IBV_WC_LOCAL_INV, IBV_WC_TSO, /* * Set value of IBV_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IBV_WC_RECV). */ IBV_WC_RECV = 1 << 7, IBV_WC_RECV_RDMA_WITH_IMM }; enum { IBV_WC_IP_CSUM_OK_SHIFT = 2 }; enum ibv_create_cq_wc_flags { IBV_WC_EX_WITH_BYTE_LEN = 1 << 0, IBV_WC_EX_WITH_IMM = 1 << 1, IBV_WC_EX_WITH_QP_NUM = 1 << 2, IBV_WC_EX_WITH_SRC_QP = 1 << 3, IBV_WC_EX_WITH_SLID = 1 << 4, IBV_WC_EX_WITH_SL = 1 << 5, IBV_WC_EX_WITH_DLID_PATH_BITS = 1 << 6, IBV_WC_EX_WITH_COMPLETION_TIMESTAMP = 1 << 7, IBV_WC_EX_WITH_CVLAN = 1 << 8, IBV_WC_EX_WITH_FLOW_TAG = 1 << 9, }; enum { IBV_WC_STANDARD_FLAGS = IBV_WC_EX_WITH_BYTE_LEN | IBV_WC_EX_WITH_IMM | IBV_WC_EX_WITH_QP_NUM | IBV_WC_EX_WITH_SRC_QP | IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL | IBV_WC_EX_WITH_DLID_PATH_BITS }; enum { IBV_CREATE_CQ_SUP_WC_FLAGS = IBV_WC_STANDARD_FLAGS | IBV_WC_EX_WITH_COMPLETION_TIMESTAMP | IBV_WC_EX_WITH_CVLAN | IBV_WC_EX_WITH_FLOW_TAG }; enum ibv_wc_flags { IBV_WC_GRH = 1 << 0, IBV_WC_WITH_IMM = 1 << 1, IBV_WC_IP_CSUM_OK = 1 << IBV_WC_IP_CSUM_OK_SHIFT, IBV_WC_WITH_INV = 1 << 3 }; struct ibv_wc { uint64_t wr_id; enum ibv_wc_status status; enum ibv_wc_opcode opcode; uint32_t vendor_err; uint32_t byte_len; /* When (wc_flags & IBV_WC_WITH_IMM): Immediate data in network byte order. * When (wc_flags & IBV_WC_WITH_INV): Stores the invalidated rkey. 
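(Editor's illustration, not part of the patch.) A short, hedged sketch of how the status codes, the IBV_WC_RECV bit test described above, and the wc_flags bits are typically consumed when draining a CQ with ibv_poll_cq(), which is declared further down in this header; the helper name and the batch size of 16 are invented for the example.

#include <stdio.h>
#include <arpa/inet.h>
#include <infiniband/verbs.h>

static void example_drain_cq(struct ibv_cq *cq)
{
    struct ibv_wc wc[16];
    int n, i;

    n = ibv_poll_cq(cq, 16, wc);
    if (n < 0) {
        fprintf(stderr, "poll_cq failed\n");
        return;
    }
    for (i = 0; i < n; i++) {
        if (wc[i].status != IBV_WC_SUCCESS) {
            fprintf(stderr, "wr_id %llu failed: %s\n",
                    (unsigned long long)wc[i].wr_id,
                    ibv_wc_status_str(wc[i].status));
            continue;
        }
        if (wc[i].opcode & IBV_WC_RECV) {
            if (wc[i].wc_flags & IBV_WC_WITH_IMM)
                printf("recv %u bytes, imm 0x%x\n", wc[i].byte_len,
                       (unsigned)ntohl(wc[i].imm_data));
            else if (wc[i].wc_flags & IBV_WC_WITH_INV)
                printf("recv %u bytes, rkey 0x%x invalidated\n",
                       wc[i].byte_len, wc[i].invalidated_rkey);
            else
                printf("recv %u bytes\n", wc[i].byte_len);
        }
    }
}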
*/ union { __be32 imm_data; uint32_t invalidated_rkey; }; uint32_t qp_num; uint32_t src_qp; int wc_flags; uint16_t pkey_index; uint16_t slid; uint8_t sl; uint8_t dlid_path_bits; }; enum ibv_access_flags { IBV_ACCESS_LOCAL_WRITE = 1, IBV_ACCESS_REMOTE_WRITE = (1<<1), IBV_ACCESS_REMOTE_READ = (1<<2), IBV_ACCESS_REMOTE_ATOMIC = (1<<3), IBV_ACCESS_MW_BIND = (1<<4), IBV_ACCESS_ZERO_BASED = (1<<5), IBV_ACCESS_ON_DEMAND = (1<<6), }; struct ibv_mw_bind_info { struct ibv_mr *mr; uint64_t addr; uint64_t length; int mw_access_flags; /* use ibv_access_flags */ }; struct ibv_pd { struct ibv_context *context; uint32_t handle; }; enum ibv_xrcd_init_attr_mask { IBV_XRCD_INIT_ATTR_FD = 1 << 0, IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 }; struct ibv_xrcd_init_attr { uint32_t comp_mask; int fd; int oflags; }; struct ibv_xrcd { struct ibv_context *context; }; enum ibv_rereg_mr_flags { IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), IBV_REREG_MR_CHANGE_PD = (1 << 1), IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), IBV_REREG_MR_KEEP_VALID = (1 << 3), IBV_REREG_MR_FLAGS_SUPPORTED = ((IBV_REREG_MR_KEEP_VALID << 1) - 1) }; struct ibv_mr { struct ibv_context *context; struct ibv_pd *pd; void *addr; size_t length; uint32_t handle; uint32_t lkey; uint32_t rkey; }; enum ibv_mw_type { IBV_MW_TYPE_1 = 1, IBV_MW_TYPE_2 = 2 }; struct ibv_mw { struct ibv_context *context; struct ibv_pd *pd; uint32_t rkey; uint32_t handle; enum ibv_mw_type type; }; struct ibv_global_route { union ibv_gid dgid; uint32_t flow_label; uint8_t sgid_index; uint8_t hop_limit; uint8_t traffic_class; }; struct ibv_grh { __be32 version_tclass_flow; __be16 paylen; uint8_t next_hdr; uint8_t hop_limit; union ibv_gid sgid; union ibv_gid dgid; }; enum ibv_rate { IBV_RATE_MAX = 0, IBV_RATE_2_5_GBPS = 2, IBV_RATE_5_GBPS = 5, IBV_RATE_10_GBPS = 3, IBV_RATE_20_GBPS = 6, IBV_RATE_30_GBPS = 4, IBV_RATE_40_GBPS = 7, IBV_RATE_60_GBPS = 8, IBV_RATE_80_GBPS = 9, IBV_RATE_120_GBPS = 10, IBV_RATE_14_GBPS = 11, IBV_RATE_56_GBPS = 12, IBV_RATE_112_GBPS = 13, IBV_RATE_168_GBPS = 14, IBV_RATE_25_GBPS = 15, IBV_RATE_100_GBPS = 16, IBV_RATE_200_GBPS = 17, IBV_RATE_300_GBPS = 18, IBV_RATE_28_GBPS = 19, IBV_RATE_50_GBPS = 20, IBV_RATE_400_GBPS = 21, IBV_RATE_600_GBPS = 22, }; /** * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ int __attribute_const ibv_rate_to_mult(enum ibv_rate rate); /** * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. * @mult: multiple to convert. */ enum ibv_rate __attribute_const mult_to_ibv_rate(int mult); /** * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. * For example, IBV_RATE_5_GBPS will return the value 5000. * @rate: rate to convert. */ int __attribute_const ibv_rate_to_mbps(enum ibv_rate rate); /** * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. * @mbps: value to convert. 
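(Editor's illustration, not part of the patch.) A stand-alone sanity check of the four conversion helpers documented above; per their comments it should print mult=2 and mbps=5000 for IBV_RATE_5_GBPS.

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
    enum ibv_rate r = IBV_RATE_5_GBPS;

    /* 5 Gbit/s is 2 x 2.5 Gbit/s, i.e. 5000 Mbit/s. */
    printf("mult=%d mbps=%d\n", ibv_rate_to_mult(r), ibv_rate_to_mbps(r));

    /* Round-trip through the inverse helpers. */
    printf("from mult 2: %d, from 5000 mbps: %d\n",
           (int)mult_to_ibv_rate(2), (int)mbps_to_ibv_rate(5000));
    return 0;
}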
*/ enum ibv_rate __attribute_const mbps_to_ibv_rate(int mbps) __attribute_const; struct ibv_ah_attr { struct ibv_global_route grh; uint16_t dlid; uint8_t sl; uint8_t src_path_bits; uint8_t static_rate; uint8_t is_global; uint8_t port_num; }; enum ibv_srq_attr_mask { IBV_SRQ_MAX_WR = 1 << 0, IBV_SRQ_LIMIT = 1 << 1 }; struct ibv_srq_attr { uint32_t max_wr; uint32_t max_sge; uint32_t srq_limit; }; struct ibv_srq_init_attr { void *srq_context; struct ibv_srq_attr attr; }; enum ibv_srq_type { IBV_SRQT_BASIC, IBV_SRQT_XRC }; enum ibv_srq_init_attr_mask { IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, IBV_SRQ_INIT_ATTR_PD = 1 << 1, IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, IBV_SRQ_INIT_ATTR_CQ = 1 << 3, IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 }; struct ibv_srq_init_attr_ex { void *srq_context; struct ibv_srq_attr attr; uint32_t comp_mask; enum ibv_srq_type srq_type; struct ibv_pd *pd; struct ibv_xrcd *xrcd; struct ibv_cq *cq; }; enum ibv_wq_type { IBV_WQT_RQ }; enum ibv_wq_init_attr_mask { IBV_WQ_INIT_ATTR_FLAGS = 1 << 0, IBV_WQ_INIT_ATTR_RESERVED = 1 << 1, }; enum ibv_wq_flags { IBV_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, IBV_WQ_FLAGS_SCATTER_FCS = 1 << 1, IBV_WQ_FLAGS_RESERVED = 1 << 2, }; struct ibv_wq_init_attr { void *wq_context; enum ibv_wq_type wq_type; uint32_t max_wr; uint32_t max_sge; struct ibv_pd *pd; struct ibv_cq *cq; uint32_t comp_mask; /* Use ibv_wq_init_attr_mask */ uint32_t create_flags; /* use ibv_wq_flags */ }; enum ibv_wq_state { IBV_WQS_RESET, IBV_WQS_RDY, IBV_WQS_ERR, IBV_WQS_UNKNOWN }; enum ibv_wq_attr_mask { IBV_WQ_ATTR_STATE = 1 << 0, IBV_WQ_ATTR_CURR_STATE = 1 << 1, IBV_WQ_ATTR_FLAGS = 1 << 2, IBV_WQ_ATTR_RESERVED = 1 << 3, }; struct ibv_wq_attr { /* enum ibv_wq_attr_mask */ uint32_t attr_mask; /* Move the WQ to this state */ enum ibv_wq_state wq_state; /* Assume this is the current WQ state */ enum ibv_wq_state curr_wq_state; uint32_t flags; /* Use ibv_wq_flags */ uint32_t flags_mask; /* Use ibv_wq_flags */ }; /* * Receive Work Queue Indirection Table. * It's used in order to distribute incoming packets between different * Receive Work Queues. Associating Receive WQs with different CPU cores * allows to workload the traffic between different CPU cores. * The Indirection Table can contain only WQs of type IBV_WQT_RQ. 
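(Editor's illustration, not part of the patch.) A minimal sketch of building the indirection table just described from four already-created IBV_WQT_RQ work queues, using the init-attr structure defined next and the ibv_create_rwq_ind_table() wrapper that appears later in this header. The helper name and the table size are assumptions.

#include <infiniband/verbs.h>

static struct ibv_rwq_ind_table *
example_make_ind_table(struct ibv_context *ctx, struct ibv_wq *wqs[4])
{
    struct ibv_rwq_ind_table_init_attr init_attr = {
        .log_ind_tbl_size = 2,    /* log2 of the entry count: 1 << 2 == 4 */
        .ind_tbl          = wqs,  /* every entry must be an IBV_WQT_RQ WQ */
        .comp_mask        = 0,
    };

    /* Returns NULL (with errno set) if the provider lacks support. */
    return ibv_create_rwq_ind_table(ctx, &init_attr);
}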
*/ struct ibv_rwq_ind_table { struct ibv_context *context; int ind_tbl_handle; int ind_tbl_num; uint32_t comp_mask; }; enum ibv_ind_table_init_attr_mask { IBV_CREATE_IND_TABLE_RESERVED = (1 << 0) }; /* * Receive Work Queue Indirection Table attributes */ struct ibv_rwq_ind_table_init_attr { uint32_t log_ind_tbl_size; /* Each entry is a pointer to a Receive Work Queue */ struct ibv_wq **ind_tbl; uint32_t comp_mask; }; enum ibv_qp_type { IBV_QPT_RC = 2, IBV_QPT_UC, IBV_QPT_UD, IBV_QPT_RAW_PACKET = 8, IBV_QPT_XRC_SEND = 9, IBV_QPT_XRC_RECV }; struct ibv_qp_cap { uint32_t max_send_wr; uint32_t max_recv_wr; uint32_t max_send_sge; uint32_t max_recv_sge; uint32_t max_inline_data; }; struct ibv_qp_init_attr { void *qp_context; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; struct ibv_qp_cap cap; enum ibv_qp_type qp_type; int sq_sig_all; }; enum ibv_qp_init_attr_mask { IBV_QP_INIT_ATTR_PD = 1 << 0, IBV_QP_INIT_ATTR_XRCD = 1 << 1, IBV_QP_INIT_ATTR_CREATE_FLAGS = 1 << 2, IBV_QP_INIT_ATTR_MAX_TSO_HEADER = 1 << 3, IBV_QP_INIT_ATTR_IND_TABLE = 1 << 4, IBV_QP_INIT_ATTR_RX_HASH = 1 << 5, IBV_QP_INIT_ATTR_RESERVED = 1 << 6 }; enum ibv_qp_create_flags { IBV_QP_CREATE_BLOCK_SELF_MCAST_LB = 1 << 1, IBV_QP_CREATE_SCATTER_FCS = 1 << 8, IBV_QP_CREATE_CVLAN_STRIPPING = 1 << 9, }; struct ibv_rx_hash_conf { /* enum ibv_rx_hash_function_flags */ uint8_t rx_hash_function; uint8_t rx_hash_key_len; uint8_t *rx_hash_key; /* enum ibv_rx_hash_fields */ uint64_t rx_hash_fields_mask; }; struct ibv_qp_init_attr_ex { void *qp_context; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; struct ibv_qp_cap cap; enum ibv_qp_type qp_type; int sq_sig_all; uint32_t comp_mask; struct ibv_pd *pd; struct ibv_xrcd *xrcd; uint32_t create_flags; uint16_t max_tso_header; struct ibv_rwq_ind_table *rwq_ind_tbl; struct ibv_rx_hash_conf rx_hash_conf; }; enum ibv_qp_open_attr_mask { IBV_QP_OPEN_ATTR_NUM = 1 << 0, IBV_QP_OPEN_ATTR_XRCD = 1 << 1, IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, IBV_QP_OPEN_ATTR_TYPE = 1 << 3, IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 }; struct ibv_qp_open_attr { uint32_t comp_mask; uint32_t qp_num; struct ibv_xrcd *xrcd; void *qp_context; enum ibv_qp_type qp_type; }; enum ibv_qp_attr_mask { IBV_QP_STATE = 1 << 0, IBV_QP_CUR_STATE = 1 << 1, IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, IBV_QP_ACCESS_FLAGS = 1 << 3, IBV_QP_PKEY_INDEX = 1 << 4, IBV_QP_PORT = 1 << 5, IBV_QP_QKEY = 1 << 6, IBV_QP_AV = 1 << 7, IBV_QP_PATH_MTU = 1 << 8, IBV_QP_TIMEOUT = 1 << 9, IBV_QP_RETRY_CNT = 1 << 10, IBV_QP_RNR_RETRY = 1 << 11, IBV_QP_RQ_PSN = 1 << 12, IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, IBV_QP_ALT_PATH = 1 << 14, IBV_QP_MIN_RNR_TIMER = 1 << 15, IBV_QP_SQ_PSN = 1 << 16, IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, IBV_QP_PATH_MIG_STATE = 1 << 18, IBV_QP_CAP = 1 << 19, IBV_QP_DEST_QPN = 1 << 20, IBV_QP_RATE_LIMIT = 1 << 25, }; enum ibv_qp_state { IBV_QPS_RESET, IBV_QPS_INIT, IBV_QPS_RTR, IBV_QPS_RTS, IBV_QPS_SQD, IBV_QPS_SQE, IBV_QPS_ERR, IBV_QPS_UNKNOWN }; enum ibv_mig_state { IBV_MIG_MIGRATED, IBV_MIG_REARM, IBV_MIG_ARMED }; struct ibv_qp_attr { enum ibv_qp_state qp_state; enum ibv_qp_state cur_qp_state; enum ibv_mtu path_mtu; enum ibv_mig_state path_mig_state; uint32_t qkey; uint32_t rq_psn; uint32_t sq_psn; uint32_t dest_qp_num; int qp_access_flags; struct ibv_qp_cap cap; struct ibv_ah_attr ah_attr; struct ibv_ah_attr alt_ah_attr; uint16_t pkey_index; uint16_t alt_pkey_index; uint8_t en_sqd_async_notify; uint8_t sq_draining; uint8_t max_rd_atomic; uint8_t max_dest_rd_atomic; uint8_t min_rnr_timer; uint8_t port_num; uint8_t 
timeout; uint8_t retry_cnt; uint8_t rnr_retry; uint8_t alt_port_num; uint8_t alt_timeout; uint32_t rate_limit; }; enum ibv_wr_opcode { IBV_WR_RDMA_WRITE, IBV_WR_RDMA_WRITE_WITH_IMM, IBV_WR_SEND, IBV_WR_SEND_WITH_IMM, IBV_WR_RDMA_READ, IBV_WR_ATOMIC_CMP_AND_SWP, IBV_WR_ATOMIC_FETCH_AND_ADD, IBV_WR_LOCAL_INV, IBV_WR_BIND_MW, IBV_WR_SEND_WITH_INV, IBV_WR_TSO, }; enum ibv_send_flags { IBV_SEND_FENCE = 1 << 0, IBV_SEND_SIGNALED = 1 << 1, IBV_SEND_SOLICITED = 1 << 2, IBV_SEND_INLINE = 1 << 3, IBV_SEND_IP_CSUM = 1 << 4 }; struct ibv_sge { uint64_t addr; uint32_t length; uint32_t lkey; }; struct ibv_send_wr { uint64_t wr_id; struct ibv_send_wr *next; struct ibv_sge *sg_list; int num_sge; enum ibv_wr_opcode opcode; int send_flags; __be32 imm_data; union { struct { uint64_t remote_addr; uint32_t rkey; } rdma; struct { uint64_t remote_addr; uint64_t compare_add; uint64_t swap; uint32_t rkey; } atomic; struct { struct ibv_ah *ah; uint32_t remote_qpn; uint32_t remote_qkey; } ud; } wr; union { struct { uint32_t remote_srqn; } xrc; } qp_type; union { struct { struct ibv_mw *mw; uint32_t rkey; struct ibv_mw_bind_info bind_info; } bind_mw; struct { void *hdr; uint16_t hdr_sz; uint16_t mss; } tso; }; }; struct ibv_recv_wr { uint64_t wr_id; struct ibv_recv_wr *next; struct ibv_sge *sg_list; int num_sge; }; struct ibv_mw_bind { uint64_t wr_id; int send_flags; struct ibv_mw_bind_info bind_info; }; struct ibv_srq { struct ibv_context *context; void *srq_context; struct ibv_pd *pd; uint32_t handle; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; /* * Work Queue. QP can be created without internal WQs "packaged" inside it, * this QP can be configured to use "external" WQ object as its * receive/send queue. * WQ associated (many to one) with Completion Queue it owns WQ properties * (PD, WQ size etc). * WQ of type IBV_WQT_RQ: * - Contains receive WQEs, in this case its PD serves as scatter as well. * - Exposes post receive function to be used to post a list of work * requests (WRs) to its receive queue. 
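(Editor's illustration, not part of the patch.) Since the comment above notes that a receive-type WQ exposes its own post-receive entry point, here is a small sketch of posting a single buffer through the ibv_post_wq_recv() wrapper defined a little further down. The helper name and the assumption that buf lies inside the registered region mr are the editor's.

#include <stdint.h>
#include <infiniband/verbs.h>

static int example_post_wq_buffer(struct ibv_wq *wq, struct ibv_mr *mr,
                                  void *buf, uint32_t len, uint64_t wr_id)
{
    struct ibv_sge sge = {
        .addr   = (uintptr_t)buf,
        .length = len,
        .lkey   = mr->lkey,
    };
    struct ibv_recv_wr wr = {
        .wr_id   = wr_id,
        .sg_list = &sge,
        .num_sge = 1,
    };
    struct ibv_recv_wr *bad_wr;

    /* 0 on success; on failure *bad_wr points at the rejected WR. */
    return ibv_post_wq_recv(wq, &wr, &bad_wr);
}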
*/ struct ibv_wq { struct ibv_context *context; void *wq_context; struct ibv_pd *pd; struct ibv_cq *cq; uint32_t wq_num; uint32_t handle; enum ibv_wq_state state; enum ibv_wq_type wq_type; int (*post_recv)(struct ibv_wq *current, struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_recv_wr); pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; uint32_t comp_mask; }; struct ibv_qp { struct ibv_context *context; void *qp_context; struct ibv_pd *pd; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; uint32_t handle; uint32_t qp_num; enum ibv_qp_state state; enum ibv_qp_type qp_type; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; struct ibv_comp_channel { struct ibv_context *context; int fd; int refcnt; }; struct ibv_cq { struct ibv_context *context; struct ibv_comp_channel *channel; void *cq_context; uint32_t handle; int cqe; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t comp_events_completed; uint32_t async_events_completed; }; struct ibv_poll_cq_attr { uint32_t comp_mask; }; struct ibv_cq_ex { struct ibv_context *context; struct ibv_comp_channel *channel; void *cq_context; uint32_t handle; int cqe; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t comp_events_completed; uint32_t async_events_completed; uint32_t comp_mask; enum ibv_wc_status status; uint64_t wr_id; int (*start_poll)(struct ibv_cq_ex *current, struct ibv_poll_cq_attr *attr); int (*next_poll)(struct ibv_cq_ex *current); void (*end_poll)(struct ibv_cq_ex *current); enum ibv_wc_opcode (*read_opcode)(struct ibv_cq_ex *current); uint32_t (*read_vendor_err)(struct ibv_cq_ex *current); uint32_t (*read_byte_len)(struct ibv_cq_ex *current); uint32_t (*read_imm_data)(struct ibv_cq_ex *current); uint32_t (*read_qp_num)(struct ibv_cq_ex *current); uint32_t (*read_src_qp)(struct ibv_cq_ex *current); int (*read_wc_flags)(struct ibv_cq_ex *current); uint32_t (*read_slid)(struct ibv_cq_ex *current); uint8_t (*read_sl)(struct ibv_cq_ex *current); uint8_t (*read_dlid_path_bits)(struct ibv_cq_ex *current); uint64_t (*read_completion_ts)(struct ibv_cq_ex *current); uint16_t (*read_cvlan)(struct ibv_cq_ex *current); uint32_t (*read_flow_tag)(struct ibv_cq_ex *current); }; static inline struct ibv_cq *ibv_cq_ex_to_cq(struct ibv_cq_ex *cq) { return (struct ibv_cq *)cq; } static inline int ibv_start_poll(struct ibv_cq_ex *cq, struct ibv_poll_cq_attr *attr) { return cq->start_poll(cq, attr); } static inline int ibv_next_poll(struct ibv_cq_ex *cq) { return cq->next_poll(cq); } static inline void ibv_end_poll(struct ibv_cq_ex *cq) { cq->end_poll(cq); } static inline enum ibv_wc_opcode ibv_wc_read_opcode(struct ibv_cq_ex *cq) { return cq->read_opcode(cq); } static inline uint32_t ibv_wc_read_vendor_err(struct ibv_cq_ex *cq) { return cq->read_vendor_err(cq); } static inline uint32_t ibv_wc_read_byte_len(struct ibv_cq_ex *cq) { return cq->read_byte_len(cq); } static inline uint32_t ibv_wc_read_imm_data(struct ibv_cq_ex *cq) { return cq->read_imm_data(cq); } static inline uint32_t ibv_wc_read_qp_num(struct ibv_cq_ex *cq) { return cq->read_qp_num(cq); } static inline uint32_t ibv_wc_read_src_qp(struct ibv_cq_ex *cq) { return cq->read_src_qp(cq); } static inline int ibv_wc_read_wc_flags(struct ibv_cq_ex *cq) { return cq->read_wc_flags(cq); } static inline uint32_t ibv_wc_read_slid(struct ibv_cq_ex *cq) { return cq->read_slid(cq); } static inline uint8_t ibv_wc_read_sl(struct ibv_cq_ex *cq) { return cq->read_sl(cq); } static inline uint8_t ibv_wc_read_dlid_path_bits(struct ibv_cq_ex 
*cq) { return cq->read_dlid_path_bits(cq); } static inline uint64_t ibv_wc_read_completion_ts(struct ibv_cq_ex *cq) { return cq->read_completion_ts(cq); } static inline uint16_t ibv_wc_read_cvlan(struct ibv_cq_ex *cq) { return cq->read_cvlan(cq); } static inline uint32_t ibv_wc_read_flow_tag(struct ibv_cq_ex *cq) { return cq->read_flow_tag(cq); } static inline int ibv_post_wq_recv(struct ibv_wq *wq, struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_recv_wr) { return wq->post_recv(wq, recv_wr, bad_recv_wr); } struct ibv_ah { struct ibv_context *context; struct ibv_pd *pd; uint32_t handle; }; enum ibv_flow_flags { IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1 << 0, IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, }; enum ibv_flow_attr_type { /* steering according to rule specifications */ IBV_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IBV_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IBV_FLOW_ATTR_SNIFFER = 0x3, }; enum ibv_flow_spec_type { IBV_FLOW_SPEC_ETH = 0x20, IBV_FLOW_SPEC_IPV4 = 0x30, IBV_FLOW_SPEC_IPV6 = 0x31, IBV_FLOW_SPEC_IPV4_EXT = 0x32, IBV_FLOW_SPEC_TCP = 0x40, IBV_FLOW_SPEC_UDP = 0x41, IBV_FLOW_SPEC_VXLAN_TUNNEL = 0x50, IBV_FLOW_SPEC_INNER = 0x100, IBV_FLOW_SPEC_ACTION_TAG = 0x1000, IBV_FLOW_SPEC_ACTION_DROP = 0x1001, }; struct ibv_flow_eth_filter { uint8_t dst_mac[6]; uint8_t src_mac[6]; uint16_t ether_type; /* * same layout as 802.1q: prio 3, cfi 1, vlan id 12 */ uint16_t vlan_tag; }; struct ibv_flow_spec_eth { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_eth_filter val; struct ibv_flow_eth_filter mask; }; struct ibv_flow_ipv4_filter { uint32_t src_ip; uint32_t dst_ip; }; struct ibv_flow_spec_ipv4 { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_ipv4_filter val; struct ibv_flow_ipv4_filter mask; }; struct ibv_flow_ipv4_ext_filter { uint32_t src_ip; uint32_t dst_ip; uint8_t proto; uint8_t tos; uint8_t ttl; uint8_t flags; }; struct ibv_flow_spec_ipv4_ext { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_ipv4_ext_filter val; struct ibv_flow_ipv4_ext_filter mask; }; struct ibv_flow_ipv6_filter { uint8_t src_ip[16]; uint8_t dst_ip[16]; uint32_t flow_label; uint8_t next_hdr; uint8_t traffic_class; uint8_t hop_limit; }; struct ibv_flow_spec_ipv6 { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_ipv6_filter val; struct ibv_flow_ipv6_filter mask; }; struct ibv_flow_tcp_udp_filter { uint16_t dst_port; uint16_t src_port; }; struct ibv_flow_spec_tcp_udp { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_tcp_udp_filter val; struct ibv_flow_tcp_udp_filter mask; }; struct ibv_flow_tunnel_filter { uint32_t tunnel_id; }; struct ibv_flow_spec_tunnel { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_tunnel_filter val; struct ibv_flow_tunnel_filter mask; }; struct ibv_flow_spec_action_tag { enum ibv_flow_spec_type type; uint16_t size; uint32_t tag_id; }; struct ibv_flow_spec_action_drop { enum ibv_flow_spec_type type; uint16_t size; }; struct ibv_flow_spec { union { struct { enum ibv_flow_spec_type type; uint16_t size; } hdr; struct ibv_flow_spec_eth eth; struct ibv_flow_spec_ipv4 ipv4; struct ibv_flow_spec_tcp_udp tcp_udp; struct ibv_flow_spec_ipv4_ext ipv4_ext; struct ibv_flow_spec_ipv6 ipv6; struct ibv_flow_spec_tunnel tunnel; struct ibv_flow_spec_action_tag flow_tag; struct 
ibv_flow_spec_action_drop drop; }; }; struct ibv_flow_attr { uint32_t comp_mask; enum ibv_flow_attr_type type; uint16_t size; uint16_t priority; uint8_t num_of_specs; uint8_t port; uint32_t flags; /* Following are the optional layers according to user request * struct ibv_flow_spec_xxx [L2] * struct ibv_flow_spec_yyy [L3/L4] */ }; struct ibv_flow { uint32_t comp_mask; struct ibv_context *context; uint32_t handle; }; struct ibv_device; struct ibv_context; /* Obsolete, never used, do not touch */ struct _ibv_device_ops { struct ibv_context * (*_dummy1)(struct ibv_device *device, int cmd_fd); void (*_dummy2)(struct ibv_context *context); }; enum { IBV_SYSFS_NAME_MAX = 64, IBV_SYSFS_PATH_MAX = 256 }; struct ibv_device { struct _ibv_device_ops _ops; enum ibv_node_type node_type; enum ibv_transport_type transport_type; /* Name of underlying kernel IB device, eg "mthca0" */ char name[IBV_SYSFS_NAME_MAX]; /* Name of uverbs device, eg "uverbs0" */ char dev_name[IBV_SYSFS_NAME_MAX]; /* Path to infiniband_verbs class device in sysfs */ char dev_path[IBV_SYSFS_PATH_MAX]; /* Path to infiniband class device in sysfs */ char ibdev_path[IBV_SYSFS_PATH_MAX]; }; struct ibv_context_ops { int (*query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); int (*query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); struct ibv_pd * (*alloc_pd)(struct ibv_context *context); int (*dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); int (*rereg_mr)(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); int (*dereg_mr)(struct ibv_mr *mr); struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, struct ibv_mw_bind *mw_bind); int (*dealloc_mw)(struct ibv_mw *mw); struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector); int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); void (*cq_event)(struct ibv_cq *cq); int (*resize_cq)(struct ibv_cq *cq, int cqe); int (*destroy_cq)(struct ibv_cq *cq); struct ibv_srq * (*create_srq)(struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr); int (*modify_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask); int (*query_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); int (*destroy_srq)(struct ibv_srq *srq); int (*post_srq_recv)(struct ibv_srq *srq, struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_recv_wr); struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int (*destroy_qp)(struct ibv_qp *qp); int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr); int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); int (*destroy_ah)(struct ibv_ah *ah); int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); void (*async_event)(struct ibv_async_event *event); }; struct ibv_context { struct ibv_device *device; struct ibv_context_ops ops; 
int cmd_fd; int async_fd; int num_comp_vectors; pthread_mutex_t mutex; void *abi_compat; }; enum ibv_cq_init_attr_mask { IBV_CQ_INIT_ATTR_MASK_FLAGS = 1 << 0, IBV_CQ_INIT_ATTR_MASK_RESERVED = 1 << 1 }; enum ibv_create_cq_attr_flags { IBV_CREATE_CQ_ATTR_SINGLE_THREADED = 1 << 0, IBV_CREATE_CQ_ATTR_RESERVED = 1 << 1, }; struct ibv_cq_init_attr_ex { /* Minimum number of entries required for CQ */ uint32_t cqe; /* Consumer-supplied context returned for completion events */ void *cq_context; /* Completion channel where completion events will be queued. * May be NULL if completion events will not be used. */ struct ibv_comp_channel *channel; /* Completion vector used to signal completion events. * Must be < context->num_comp_vectors. */ uint32_t comp_vector; /* Or'ed bit of enum ibv_create_cq_wc_flags. */ uint64_t wc_flags; /* compatibility mask (extended verb). Or'd flags of * enum ibv_cq_init_attr_mask */ uint32_t comp_mask; /* create cq attr flags - one or more flags from * enum ibv_create_cq_attr_flags */ uint32_t flags; }; enum ibv_values_mask { IBV_VALUES_MASK_RAW_CLOCK = 1 << 0, IBV_VALUES_MASK_RESERVED = 1 << 1 }; struct ibv_values_ex { uint32_t comp_mask; struct timespec raw_clock; }; enum verbs_context_mask { VERBS_CONTEXT_XRCD = 1 << 0, VERBS_CONTEXT_SRQ = 1 << 1, VERBS_CONTEXT_QP = 1 << 2, VERBS_CONTEXT_CREATE_FLOW = 1 << 3, VERBS_CONTEXT_DESTROY_FLOW = 1 << 4, VERBS_CONTEXT_RESERVED = 1 << 5 }; struct verbs_context { /* "grows up" - new fields go here */ int (*destroy_rwq_ind_table)(struct ibv_rwq_ind_table *rwq_ind_table); struct ibv_rwq_ind_table *(*create_rwq_ind_table)(struct ibv_context *context, struct ibv_rwq_ind_table_init_attr *init_attr); int (*destroy_wq)(struct ibv_wq *wq); int (*modify_wq)(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr); struct ibv_wq * (*create_wq)(struct ibv_context *context, struct ibv_wq_init_attr *wq_init_attr); int (*query_rt_values)(struct ibv_context *context, struct ibv_values_ex *values); struct ibv_cq_ex *(*create_cq_ex)(struct ibv_context *context, struct ibv_cq_init_attr_ex *init_attr); struct verbs_ex_private *priv; int (*query_device_ex)(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size); int (*ibv_destroy_flow) (struct ibv_flow *flow); void (*ABI_placeholder2) (void); /* DO NOT COPY THIS GARBAGE */ struct ibv_flow * (*ibv_create_flow) (struct ibv_qp *qp, struct ibv_flow_attr *flow_attr); void (*ABI_placeholder1) (void); /* DO NOT COPY THIS GARBAGE */ struct ibv_qp *(*open_qp)(struct ibv_context *context, struct ibv_qp_open_attr *attr); struct ibv_qp *(*create_qp_ex)(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex); int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, struct ibv_srq_init_attr_ex *srq_init_attr_ex); struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr); int (*close_xrcd)(struct ibv_xrcd *xrcd); uint64_t has_comp_mask; size_t sz; /* Must be immediately before struct ibv_context */ struct ibv_context context; /* Must be last field in the struct */ }; static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) { return (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED) ? 
NULL : container_of(ctx, struct verbs_context, context); } #define verbs_get_ctx_op(ctx, op) ({ \ struct verbs_context *__vctx = verbs_get_ctx(ctx); \ (!__vctx || (__vctx->sz < sizeof(*__vctx) - offsetof(struct verbs_context, op)) || \ !__vctx->op) ? NULL : __vctx; }) #define verbs_set_ctx_op(_vctx, op, ptr) ({ \ struct verbs_context *vctx = _vctx; \ if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ vctx->op = ptr; }) /** * ibv_get_device_list - Get list of IB devices currently available * @num_devices: optional. if non-NULL, set to the number of devices * returned in the array. * * Return a NULL-terminated array of IB devices. The array can be * released with ibv_free_device_list(). */ struct ibv_device **ibv_get_device_list(int *num_devices); /** * ibv_free_device_list - Free list from ibv_get_device_list() * * Free an array of devices returned from ibv_get_device_list(). Once * the array is freed, pointers to devices that were not opened with * ibv_open_device() are no longer valid. Client code must open all * devices it intends to use before calling ibv_free_device_list(). */ void ibv_free_device_list(struct ibv_device **list); /** * ibv_get_device_name - Return kernel device name */ const char *ibv_get_device_name(struct ibv_device *device); /** * ibv_get_device_guid - Return device's node GUID */ __be64 ibv_get_device_guid(struct ibv_device *device); /** * ibv_open_device - Initialize device for use */ struct ibv_context *ibv_open_device(struct ibv_device *device); /** * ibv_close_device - Release device */ int ibv_close_device(struct ibv_context *context); /** * ibv_get_async_event - Get next async event * @event: Pointer to use to return async event * * All async events returned by ibv_get_async_event() must eventually * be acknowledged with ibv_ack_async_event(). */ int ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); /** * ibv_ack_async_event - Acknowledge an async event * @event: Event to be acknowledged. * * All async events which are returned by ibv_get_async_event() must * be acknowledged. To avoid races, destroying an object (CQ, SRQ or * QP) will wait for all affiliated events to be acknowledged, so * there should be a one-to-one correspondence between acks and * successful gets. 
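(Editor's illustration, not part of the patch.) One iteration of the get/ack discipline described above, with the acknowledgement issued after the event has been examined; the handler name and the event cases shown are arbitrary choices for the example.

#include <stdio.h>
#include <infiniband/verbs.h>

static int example_handle_async_event(struct ibv_context *ctx)
{
    struct ibv_async_event ev;

    if (ibv_get_async_event(ctx, &ev))
        return -1;

    switch (ev.event_type) {
    case IBV_EVENT_PORT_ACTIVE:
    case IBV_EVENT_PORT_ERR:
        printf("port %d: %s\n", ev.element.port_num,
               ibv_event_type_str(ev.event_type));
        break;
    case IBV_EVENT_QP_LAST_WQE_REACHED:
        /* ev.element.qp identifies the affected QP. */
        break;
    default:
        break;
    }

    /* Exactly one ack per successful get; destroying the affected
     * object blocks until its events have been acknowledged. */
    ibv_ack_async_event(&ev);
    return 0;
}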
*/ void ibv_ack_async_event(struct ibv_async_event *event); /** * ibv_query_device - Get device properties */ int ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); /** * ibv_query_port - Get port properties */ int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); static inline int ___ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /* For compatibility when running with old libibverbs */ port_attr->link_layer = IBV_LINK_LAYER_UNSPECIFIED; port_attr->reserved = 0; return ibv_query_port(context, port_num, port_attr); } #define ibv_query_port(context, port_num, port_attr) \ ___ibv_query_port(context, port_num, port_attr) /** * ibv_query_gid - Get a GID table entry */ int ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); /** * ibv_query_pkey - Get a P_Key table entry */ int ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index, __be16 *pkey); /** * ibv_alloc_pd - Allocate a protection domain */ struct ibv_pd *ibv_alloc_pd(struct ibv_context *context); /** * ibv_dealloc_pd - Free a protection domain */ int ibv_dealloc_pd(struct ibv_pd *pd); static inline struct ibv_flow *ibv_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow) { struct verbs_context *vctx = verbs_get_ctx_op(qp->context, ibv_create_flow); if (!vctx || !vctx->ibv_create_flow) { errno = ENOSYS; return NULL; } return vctx->ibv_create_flow(qp, flow); } static inline int ibv_destroy_flow(struct ibv_flow *flow_id) { struct verbs_context *vctx = verbs_get_ctx_op(flow_id->context, ibv_destroy_flow); if (!vctx || !vctx->ibv_destroy_flow) return -ENOSYS; return vctx->ibv_destroy_flow(flow_id); } /** * ibv_open_xrcd - Open an extended connection domain */ static inline struct ibv_xrcd * ibv_open_xrcd(struct ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr) { struct verbs_context *vctx = verbs_get_ctx_op(context, open_xrcd); if (!vctx) { errno = ENOSYS; return NULL; } return vctx->open_xrcd(context, xrcd_init_attr); } /** * ibv_close_xrcd - Close an extended connection domain */ static inline int ibv_close_xrcd(struct ibv_xrcd *xrcd) { struct verbs_context *vctx = verbs_get_ctx(xrcd->context); return vctx->close_xrcd(xrcd); } /** * ibv_reg_mr - Register a memory region */ struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); enum ibv_rereg_mr_err_code { /* Old MR is valid, invalid input */ IBV_REREG_MR_ERR_INPUT = -1, /* Old MR is valid, failed via don't fork on new address range */ IBV_REREG_MR_ERR_DONT_FORK_NEW = -2, /* New MR is valid, failed via do fork on old address range */ IBV_REREG_MR_ERR_DO_FORK_OLD = -3, /* MR shouldn't be used, command error */ IBV_REREG_MR_ERR_CMD = -4, /* MR shouldn't be used, command error, invalid fork state on new address range */ IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW = -5, }; /** * ibv_rereg_mr - Re-Register a memory region */ int ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); /** * ibv_dereg_mr - Deregister a memory region */ int ibv_dereg_mr(struct ibv_mr *mr); /** * ibv_alloc_mw - Allocate a memory window */ static inline struct ibv_mw *ibv_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) { struct ibv_mw *mw; if (!pd->context->ops.alloc_mw) { errno = ENOSYS; return NULL; } mw = pd->context->ops.alloc_mw(pd, type); return mw; } /** * ibv_dealloc_mw - Free a memory window */ static inline int ibv_dealloc_mw(struct 
ibv_mw *mw) { return mw->context->ops.dealloc_mw(mw); } /** * ibv_inc_rkey - Increase the 8 lsb in the given rkey */ static inline uint32_t ibv_inc_rkey(uint32_t rkey) { const uint32_t mask = 0x000000ff; uint8_t newtag = (uint8_t)((rkey + 1) & mask); return (rkey & ~mask) | newtag; } /** * ibv_bind_mw - Bind a memory window to a region */ static inline int ibv_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, struct ibv_mw_bind *mw_bind) { if (mw->type != IBV_MW_TYPE_1) return EINVAL; return mw->context->ops.bind_mw(qp, mw, mw_bind); } /** * ibv_create_comp_channel - Create a completion event channel */ struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context); /** * ibv_destroy_comp_channel - Destroy a completion event channel */ int ibv_destroy_comp_channel(struct ibv_comp_channel *channel); /** * ibv_create_cq - Create a completion queue * @context - Context CQ will be attached to * @cqe - Minimum number of entries required for CQ * @cq_context - Consumer-supplied context returned for completion events * @channel - Completion channel where completion events will be queued. * May be NULL if completion events will not be used. * @comp_vector - Completion vector used to signal completion events. * Must be >= 0 and < context->num_comp_vectors. */ struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); /** * ibv_create_cq_ex - Create a completion queue * @context - Context CQ will be attached to * @cq_attr - Attributes to create the CQ with */ static inline struct ibv_cq_ex *ibv_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr) { struct verbs_context *vctx = verbs_get_ctx_op(context, create_cq_ex); if (!vctx) { errno = ENOSYS; return NULL; } if (cq_attr->comp_mask & ~(IBV_CQ_INIT_ATTR_MASK_RESERVED - 1)) { errno = EINVAL; return NULL; } return vctx->create_cq_ex(context, cq_attr); } /** * ibv_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. */ int ibv_resize_cq(struct ibv_cq *cq, int cqe); /** * ibv_destroy_cq - Destroy a completion queue */ int ibv_destroy_cq(struct ibv_cq *cq); /** * ibv_get_cq_event - Read next CQ event * @channel: Channel to get next event from. * @cq: Used to return pointer to CQ. * @cq_context: Used to return consumer-supplied CQ context. * * All completion events returned by ibv_get_cq_event() must * eventually be acknowledged with ibv_ack_cq_events(). */ int ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq, void **cq_context); /** * ibv_ack_cq_events - Acknowledge CQ completion events * @cq: CQ to acknowledge events for * @nevents: Number of events to acknowledge. * * All completion events which are returned by ibv_get_cq_event() must * be acknowledged. To avoid races, ibv_destroy_cq() will wait for * all completion events to be acknowledged, so there should be a * one-to-one correspondence between acks and successful gets. An * application may accumulate multiple completion events and * acknowledge them in a single call to ibv_ack_cq_events() by passing * the number of events to ack in @nevents. 
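(Editor's illustration, not part of the patch.) The classic event-then-drain loop implied by the comments above: take one channel event, acknowledge it, re-arm the CQ, then poll until empty so that no completion is missed between the notification and the poll. The helper name is hypothetical.

#include <infiniband/verbs.h>

/* Returns the number of completions handled, or -1 on error. */
static int example_wait_and_drain(struct ibv_comp_channel *channel)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;
    void *cq_ctx;
    int handled = 0;

    if (ibv_get_cq_event(channel, &cq, &cq_ctx))
        return -1;
    /* Acks may be batched, but every event must be acked eventually. */
    ibv_ack_cq_events(cq, 1);

    /* Re-arm before draining so a completion arriving now still
     * generates the next event. */
    if (ibv_req_notify_cq(cq, 0))
        return -1;

    while (ibv_poll_cq(cq, 1, &wc) > 0)
        handled++;    /* status/opcode handling elided */

    return handled;
}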
*/ void ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents); /** * ibv_poll_cq - Poll a CQ for work completions * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries of &struct ibv_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and strictly less than num_entries, then the CQ was * emptied. */ static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) { return cq->context->ops.poll_cq(cq, num_entries, wc); } /** * ibv_req_notify_cq - Request completion notification on a CQ. An * event will be added to the completion channel associated with the * CQ when an entry is added to the CQ. * @cq: The completion queue to request notification for. * @solicited_only: If non-zero, an event will be generated only for * the next solicited CQ entry. If zero, any CQ entry, solicited or * not, will generate an event. */ static inline int ibv_req_notify_cq(struct ibv_cq *cq, int solicited_only) { return cq->context->ops.req_notify_cq(cq, solicited_only); } /** * ibv_create_srq - Creates a SRQ associated with the specified protection * domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the SRQ. * * srq_attr->max_wr and srq_attr->max_sge are read the determine the * requested size of the SRQ, and set to the actual values allocated * on return. If ibv_create_srq() succeeds, then max_wr and max_sge * will always be at least as large as the requested values. */ struct ibv_srq *ibv_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr); static inline struct ibv_srq * ibv_create_srq_ex(struct ibv_context *context, struct ibv_srq_init_attr_ex *srq_init_attr_ex) { struct verbs_context *vctx; uint32_t mask = srq_init_attr_ex->comp_mask; if (!(mask & ~(IBV_SRQ_INIT_ATTR_PD | IBV_SRQ_INIT_ATTR_TYPE)) && (mask & IBV_SRQ_INIT_ATTR_PD) && (!(mask & IBV_SRQ_INIT_ATTR_TYPE) || (srq_init_attr_ex->srq_type == IBV_SRQT_BASIC))) return ibv_create_srq(srq_init_attr_ex->pd, (struct ibv_srq_init_attr *)srq_init_attr_ex); vctx = verbs_get_ctx_op(context, create_srq_ex); if (!vctx) { errno = ENOSYS; return NULL; } return vctx->create_srq_ex(context, srq_init_attr_ex); } /** * ibv_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified. * * The mask may contain IBV_SRQ_MAX_WR to resize the SRQ and/or * IBV_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask); /** * ibv_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. 
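(Editor's illustration, not part of the patch.) A sketch of the SRQ calls documented above: create the SRQ, then use IBV_SRQ_LIMIT to arm the low-watermark notification. The sizes and helper name are assumptions; as noted above, max_wr and max_sge may be adjusted upward by the provider.

#include <string.h>
#include <infiniband/verbs.h>

static struct ibv_srq *example_create_srq(struct ibv_pd *pd)
{
    struct ibv_srq_init_attr init_attr;
    struct ibv_srq_attr attr;
    struct ibv_srq *srq;

    memset(&init_attr, 0, sizeof(init_attr));
    init_attr.attr.max_wr  = 256;   /* requested; actual value written back */
    init_attr.attr.max_sge = 1;

    srq = ibv_create_srq(pd, &init_attr);
    if (!srq)
        return NULL;

    /* Request IBV_EVENT_SRQ_LIMIT_REACHED once fewer than 16 receives
     * remain queued. */
    memset(&attr, 0, sizeof(attr));
    attr.srq_limit = 16;
    if (ibv_modify_srq(srq, &attr, IBV_SRQ_LIMIT)) {
        ibv_destroy_srq(srq);
        return NULL;
    }
    return srq;
}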
*/ int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); static inline int ibv_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) { struct verbs_context *vctx = verbs_get_ctx_op(srq->context, get_srq_num); if (!vctx) return ENOSYS; return vctx->get_srq_num(srq, srq_num); } /** * ibv_destroy_srq - Destroys the specified SRQ. * @srq: The SRQ to destroy. */ int ibv_destroy_srq(struct ibv_srq *srq); /** * ibv_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ibv_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_recv_wr) { return srq->context->ops.post_srq_recv(srq, recv_wr, bad_recv_wr); } /** * ibv_create_qp - Create a queue pair. */ struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); static inline struct ibv_qp * ibv_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex) { struct verbs_context *vctx; uint32_t mask = qp_init_attr_ex->comp_mask; if (mask == IBV_QP_INIT_ATTR_PD) return ibv_create_qp(qp_init_attr_ex->pd, (struct ibv_qp_init_attr *)qp_init_attr_ex); vctx = verbs_get_ctx_op(context, create_qp_ex); if (!vctx) { errno = ENOSYS; return NULL; } return vctx->create_qp_ex(context, qp_init_attr_ex); } /** * ibv_query_rt_values_ex - Get current real time @values of a device. * @values - in/out - defines the attributes we need to query/queried. * (Or's bits of enum ibv_values_mask on values->comp_mask field) */ static inline int ibv_query_rt_values_ex(struct ibv_context *context, struct ibv_values_ex *values) { struct verbs_context *vctx; vctx = verbs_get_ctx_op(context, query_rt_values); if (!vctx) return ENOSYS; if (values->comp_mask & ~(IBV_VALUES_MASK_RESERVED - 1)) return EINVAL; return vctx->query_rt_values(context, values); } /** * ibv_query_device_ex - Get extended device properties */ static inline int ibv_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr) { struct verbs_context *vctx; int ret; vctx = verbs_get_ctx_op(context, query_device_ex); if (!vctx) goto legacy; ret = vctx->query_device_ex(context, input, attr, sizeof(*attr)); if (ret == ENOSYS) goto legacy; return ret; legacy: memset(attr, 0, sizeof(*attr)); ret = ibv_query_device(context, &attr->orig_attr); return ret; } /** * ibv_open_qp - Open a shareable queue pair. */ static inline struct ibv_qp * ibv_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *qp_open_attr) { struct verbs_context *vctx = verbs_get_ctx_op(context, open_qp); if (!vctx) { errno = ENOSYS; return NULL; } return vctx->open_qp(context, qp_open_attr); } /** * ibv_modify_qp - Modify a queue pair. */ int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); /** * ibv_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @attr: The attributes of the specified QP. * @attr_mask: A bit-mask used to select specific attributes to query. * @init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. 
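(Editor's illustration, not part of the patch.) A short use of the attribute mask described above, querying only the QP state and capabilities; the helper name is hypothetical.

#include <infiniband/verbs.h>

/* Returns 1 if the QP is in RTS, 0 if not, -1 on query failure. */
static int example_qp_is_rts(struct ibv_qp *qp)
{
    struct ibv_qp_attr attr;
    struct ibv_qp_init_attr init_attr;

    if (ibv_query_qp(qp, &attr, IBV_QP_STATE | IBV_QP_CAP, &init_attr))
        return -1;

    /* attr.cap now holds the queue sizes the provider actually granted. */
    return attr.qp_state == IBV_QPS_RTS;
}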
*/ int ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); /** * ibv_destroy_qp - Destroy a queue pair. */ int ibv_destroy_qp(struct ibv_qp *qp); /* * ibv_create_wq - Creates a WQ associated with the specified protection * domain. * @context: ibv_context. * @wq_init_attr: A list of initial attributes required to create the * WQ. If WQ creation succeeds, then the attributes are updated to * the actual capabilities of the created WQ. * * wq_init_attr->max_wr and wq_init_attr->max_sge determine * the requested size of the WQ, and set to the actual values allocated * on return. * If ibv_create_wq() succeeds, then max_wr and max_sge will always be * at least as large as the requested values. * * Return Value * ibv_create_wq() returns a pointer to the created WQ, or NULL if the request * fails. */ static inline struct ibv_wq *ibv_create_wq(struct ibv_context *context, struct ibv_wq_init_attr *wq_init_attr) { struct verbs_context *vctx = verbs_get_ctx_op(context, create_wq); struct ibv_wq *wq; if (!vctx) { errno = ENOSYS; return NULL; } wq = vctx->create_wq(context, wq_init_attr); - if (wq) { + if (wq) wq->events_completed = 0; - pthread_mutex_init(&wq->mutex, NULL); - pthread_cond_init(&wq->cond, NULL); - } return wq; } /* * ibv_modify_wq - Modifies the attributes for the specified WQ. * @wq: The WQ to modify. * @wq_attr: On input, specifies the WQ attributes to modify. * wq_attr->attr_mask: A bit-mask used to specify which attributes of the WQ * are being modified. * On output, the current values of selected WQ attributes are returned. * * Return Value * ibv_modify_wq() returns 0 on success, or the value of errno * on failure (which indicates the failure reason). * */ static inline int ibv_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr) { struct verbs_context *vctx = verbs_get_ctx_op(wq->context, modify_wq); if (!vctx) return ENOSYS; return vctx->modify_wq(wq, wq_attr); } /* * ibv_destroy_wq - Destroys the specified WQ. * @ibv_wq: The WQ to destroy. * Return Value * ibv_destroy_wq() returns 0 on success, or the value of errno * on failure (which indicates the failure reason). */ static inline int ibv_destroy_wq(struct ibv_wq *wq) { struct verbs_context *vctx; vctx = verbs_get_ctx_op(wq->context, destroy_wq); if (!vctx) return ENOSYS; return vctx->destroy_wq(wq); } /* * ibv_create_rwq_ind_table - Creates a receive work queue Indirection Table * @context: ibv_context. * @init_attr: A list of initial attributes required to create the Indirection Table. * Return Value * ibv_create_rwq_ind_table returns a pointer to the created * Indirection Table, or NULL if the request fails. */ static inline struct ibv_rwq_ind_table *ibv_create_rwq_ind_table(struct ibv_context *context, struct ibv_rwq_ind_table_init_attr *init_attr) { struct verbs_context *vctx; vctx = verbs_get_ctx_op(context, create_rwq_ind_table); if (!vctx) { errno = ENOSYS; return NULL; } return vctx->create_rwq_ind_table(context, init_attr); } /* * ibv_destroy_rwq_ind_table - Destroys the specified Indirection Table. * @rwq_ind_table: The Indirection Table to destroy. * Return Value * ibv_destroy_rwq_ind_table() returns 0 on success, or the value of errno * on failure (which indicates the failure reason). 
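(Editor's illustration, not part of the patch.) A minimal use of the ibv_modify_wq() wrapper above; it assumes, as is usual for verbs objects but not spelled out in this header, that a freshly created WQ must be moved to the ready state before receives are posted. The helper name is hypothetical.

#include <string.h>
#include <infiniband/verbs.h>

static int example_wq_to_ready(struct ibv_wq *wq)
{
    struct ibv_wq_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.attr_mask = IBV_WQ_ATTR_STATE;
    attr.wq_state  = IBV_WQS_RDY;

    /* 0 on success or an errno value describing the failure. */
    return ibv_modify_wq(wq, &attr);
}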
*/ static inline int ibv_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) { struct verbs_context *vctx; vctx = verbs_get_ctx_op(rwq_ind_table->context, destroy_rwq_ind_table); if (!vctx) return ENOSYS; return vctx->destroy_rwq_ind_table(rwq_ind_table); } /** * ibv_post_send - Post a list of work requests to a send queue. * * If IBV_SEND_INLINE flag is set, the data buffers can be reused * immediately after the call returns. */ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { return qp->context->ops.post_send(qp, wr, bad_wr); } /** * ibv_post_recv - Post a list of work requests to a receive queue. */ static inline int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { return qp->context->ops.post_recv(qp, wr, bad_wr); } /** * ibv_create_ah - Create an address handle. */ struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); /** * ibv_init_ah_from_wc - Initializes address handle attributes from a * work completion. * @context: Device context on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. */ int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, struct ibv_wc *wc, struct ibv_grh *grh, struct ibv_ah_attr *ah_attr); /** * ibv_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, struct ibv_grh *grh, uint8_t port_num); /** * ibv_destroy_ah - Destroy an address handle. */ int ibv_destroy_ah(struct ibv_ah *ah); /** * ibv_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be a UD QP. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to route multicast packets correctly, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); /** * ibv_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); /** * ibv_fork_init - Prepare data structures so that fork() may be used * safely. If this function is not called or returns a non-zero * status, then libibverbs data structures are not fork()-safe and the * effect of an application calling fork() is undefined. 
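(Editor's illustration, not part of the patch.) A stand-alone sketch of the start-up sequence suggested by the comments above: opt in to fork() safety first, enumerate devices, open the one to be used, and only then free the list, since unopened device pointers become invalid at that point.

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
    struct ibv_device **list;
    struct ibv_context *ctx;
    int num;

    /* Call before any verbs resources exist. */
    if (ibv_fork_init())
        fprintf(stderr, "fork support unavailable; fork() would be unsafe\n");

    list = ibv_get_device_list(&num);
    if (!list)
        return 1;
    if (num == 0) {
        ibv_free_device_list(list);
        return 1;
    }

    printf("using %s\n", ibv_get_device_name(list[0]));
    ctx = ibv_open_device(list[0]);

    /* The device we intend to use is open, so the list may be freed. */
    ibv_free_device_list(list);

    if (!ctx)
        return 1;
    ibv_close_device(ctx);
    return 0;
}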
*/ int ibv_fork_init(void); /** * ibv_node_type_str - Return string describing node_type enum value */ const char *ibv_node_type_str(enum ibv_node_type node_type); /** * ibv_port_state_str - Return string describing port_state enum value */ const char *ibv_port_state_str(enum ibv_port_state port_state); /** * ibv_event_type_str - Return string describing event_type enum value */ const char *ibv_event_type_str(enum ibv_event_type event); #define ETHERNET_LL_SIZE 6 int ibv_resolve_eth_l2_from_gid(struct ibv_context *context, struct ibv_ah_attr *attr, uint8_t eth_mac[ETHERNET_LL_SIZE], uint16_t *vid); static inline int ibv_is_qpt_supported(uint32_t caps, enum ibv_qp_type qpt) { return !!(caps & (1 << qpt)); } END_C_DECLS # undef __attribute_const #endif /* INFINIBAND_VERBS_H */ diff --git a/contrib/ofed/libmlx4/mlx4.c b/contrib/ofed/libmlx4/mlx4.c index 229c2670b5ed..db8a07d48381 100644 --- a/contrib/ofed/libmlx4/mlx4.c +++ b/contrib/ofed/libmlx4/mlx4.c @@ -1,327 +1,356 @@ /* * Copyright (c) 2007 Cisco, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include "mlx4.h" #include "mlx4-abi.h" #ifndef PCI_VENDOR_ID_MELLANOX #define PCI_VENDOR_ID_MELLANOX 0x15b3 #endif #define HCA(v, d) \ { .vendor = PCI_VENDOR_ID_##v, \ .device = d } static struct { unsigned vendor; unsigned device; } hca_table[] = { HCA(MELLANOX, 0x6340), /* MT25408 "Hermon" SDR */ HCA(MELLANOX, 0x634a), /* MT25408 "Hermon" DDR */ HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */ HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */ HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */ HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */ HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */ HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/ HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */ HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */ HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */ HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */ HCA(MELLANOX, 0x1005), /* MT27510 Family */ HCA(MELLANOX, 0x1006), /* MT27511 Family */ HCA(MELLANOX, 0x1007), /* MT27520 Family */ HCA(MELLANOX, 0x1008), /* MT27521 Family */ HCA(MELLANOX, 0x1009), /* MT27530 Family */ HCA(MELLANOX, 0x100a), /* MT27531 Family */ HCA(MELLANOX, 0x100b), /* MT27540 Family */ HCA(MELLANOX, 0x100c), /* MT27541 Family */ HCA(MELLANOX, 0x100d), /* MT27550 Family */ HCA(MELLANOX, 0x100e), /* MT27551 Family */ HCA(MELLANOX, 0x100f), /* MT27560 Family */ HCA(MELLANOX, 0x1010), /* MT27561 Family */ }; static struct ibv_context_ops mlx4_ctx_ops = { .query_device = mlx4_query_device, .query_port = mlx4_query_port, .alloc_pd = mlx4_alloc_pd, .dealloc_pd = mlx4_free_pd, .reg_mr = mlx4_reg_mr, .rereg_mr = mlx4_rereg_mr, .dereg_mr = mlx4_dereg_mr, .alloc_mw = mlx4_alloc_mw, .dealloc_mw = mlx4_dealloc_mw, .bind_mw = mlx4_bind_mw, .create_cq = mlx4_create_cq, .poll_cq = mlx4_poll_cq, .req_notify_cq = mlx4_arm_cq, .cq_event = mlx4_cq_event, .resize_cq = mlx4_resize_cq, .destroy_cq = mlx4_destroy_cq, .create_srq = mlx4_create_srq, .modify_srq = mlx4_modify_srq, .query_srq = mlx4_query_srq, .destroy_srq = mlx4_destroy_srq, .post_srq_recv = mlx4_post_srq_recv, .create_qp = mlx4_create_qp, .query_qp = mlx4_query_qp, .modify_qp = mlx4_modify_qp, .destroy_qp = mlx4_destroy_qp, .post_send = mlx4_post_send, .post_recv = mlx4_post_recv, .create_ah = mlx4_create_ah, .destroy_ah = mlx4_destroy_ah, .attach_mcast = ibv_cmd_attach_mcast, .detach_mcast = ibv_cmd_detach_mcast }; static int mlx4_map_internal_clock(struct mlx4_device *mdev, struct ibv_context *ibv_ctx) { struct mlx4_context *context = to_mctx(ibv_ctx); void *hca_clock_page; hca_clock_page = mmap(NULL, mdev->page_size, PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd, mdev->page_size * 3); if (hca_clock_page == MAP_FAILED) { fprintf(stderr, PFX "Warning: Timestamp available,\n" "but failed to mmap() hca core clock page.\n"); return -1; } context->hca_core_clock = hca_clock_page + (context->core_clock.offset & (mdev->page_size - 1)); return 0; } static int mlx4_init_context(struct verbs_device *v_device, struct ibv_context *ibv_ctx, int cmd_fd) { struct mlx4_context *context; struct ibv_get_context cmd; struct mlx4_alloc_ucontext_resp resp; int i; + int ret; struct mlx4_alloc_ucontext_resp_v3 resp_v3; 
__u16 bf_reg_size; struct mlx4_device *dev = to_mdev(&v_device->device); struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx); struct ibv_device_attr_ex dev_attrs; /* memory footprint of mlx4_context and verbs_context share * struct ibv_context. */ context = to_mctx(ibv_ctx); ibv_ctx->cmd_fd = cmd_fd; if (dev->abi_version <= MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION) { if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd, &resp_v3.ibv_resp, sizeof resp_v3)) return errno; context->num_qps = resp_v3.qp_tab_size; bf_reg_size = resp_v3.bf_reg_size; context->cqe_size = sizeof (struct mlx4_cqe); } else { if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp)) return errno; context->num_qps = resp.qp_tab_size; bf_reg_size = resp.bf_reg_size; if (resp.dev_caps & MLX4_USER_DEV_CAP_64B_CQE) context->cqe_size = resp.cqe_size; else context->cqe_size = sizeof (struct mlx4_cqe); } context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS; context->qp_table_mask = (1 << context->qp_table_shift) - 1; for (i = 0; i < MLX4_PORTS_NUM; ++i) context->port_query_cache[i].valid = 0; - pthread_mutex_init(&context->qp_table_mutex, NULL); + ret = pthread_mutex_init(&context->qp_table_mutex, NULL); + if (ret) + return ret; for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i) context->qp_table[i].refcnt = 0; for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) context->db_list[i] = NULL; - mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps); - pthread_mutex_init(&context->db_list_mutex, NULL); + ret = mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps); + if (ret) + goto err; + + ret = pthread_mutex_init(&context->db_list_mutex, NULL); + if (ret) + goto err_xsrq; context->uar = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED, cmd_fd, 0); if (context->uar == MAP_FAILED) return errno; if (bf_reg_size) { context->bf_page = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED, cmd_fd, dev->page_size); if (context->bf_page == MAP_FAILED) { fprintf(stderr, PFX "Warning: BlueFlame available, " "but failed to mmap() BlueFlame page.\n"); context->bf_page = NULL; context->bf_buf_size = 0; } else { context->bf_buf_size = bf_reg_size / 2; context->bf_offset = 0; - pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + ret = pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + if (ret) + goto err_db_list; } } else { context->bf_page = NULL; context->bf_buf_size = 0; } - pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + ret = pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + if (ret) + goto err_bf_lock; ibv_ctx->ops = mlx4_ctx_ops; context->hca_core_clock = NULL; memset(&dev_attrs, 0, sizeof(dev_attrs)); if (!mlx4_query_device_ex(ibv_ctx, NULL, &dev_attrs, sizeof(struct ibv_device_attr_ex))) { context->max_qp_wr = dev_attrs.orig_attr.max_qp_wr; context->max_sge = dev_attrs.orig_attr.max_sge; if (context->core_clock.offset_valid) mlx4_map_internal_clock(dev, ibv_ctx); } verbs_ctx->has_comp_mask = VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ | VERBS_CONTEXT_QP; verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd); verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd); verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex); verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num); verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex); verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp); verbs_set_ctx_op(verbs_ctx, ibv_create_flow, ibv_cmd_create_flow); verbs_set_ctx_op(verbs_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow); 
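/* Editor's note, not part of the patch itself: the error handling added
 * in this hunk unwinds in reverse order of initialization.  The
 * err_bf_lock/err_db_list/err_xsrq/err labels below destroy, in turn,
 * bf_lock (only when bf_buf_size is non-zero, i.e. when that spinlock
 * was actually initialized), db_list_mutex, the xsrq table, and
 * qp_table_mutex.  mlx4_uninit_context() below gains matching destroy
 * calls for the locks and the xsrq table. */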
verbs_set_ctx_op(verbs_ctx, create_cq_ex, mlx4_create_cq_ex); verbs_set_ctx_op(verbs_ctx, query_device_ex, mlx4_query_device_ex); verbs_set_ctx_op(verbs_ctx, query_rt_values, mlx4_query_rt_values); return 0; +err_bf_lock: + if (context->bf_buf_size) + pthread_spin_destroy(&context->bf_lock); +err_db_list: + pthread_mutex_destroy(&context->db_list_mutex); +err_xsrq: + mlx4_cleanup_xsrq_table(&context->xsrq_table); +err: + pthread_mutex_destroy(&context->qp_table_mutex); + + return ret; } static void mlx4_uninit_context(struct verbs_device *v_device, struct ibv_context *ibv_ctx) { struct mlx4_context *context = to_mctx(ibv_ctx); + pthread_mutex_destroy(&context->qp_table_mutex); + mlx4_cleanup_xsrq_table(&context->xsrq_table); + pthread_mutex_destroy(&context->db_list_mutex); + pthread_spin_destroy(&context->bf_lock); + pthread_spin_destroy(&context->uar_lock); + munmap(context->uar, to_mdev(&v_device->device)->page_size); if (context->bf_page) munmap(context->bf_page, to_mdev(&v_device->device)->page_size); if (context->hca_core_clock) munmap(context->hca_core_clock - context->core_clock.offset, to_mdev(&v_device->device)->page_size); } static struct verbs_device_ops mlx4_dev_ops = { .init_context = mlx4_init_context, .uninit_context = mlx4_uninit_context, }; static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path, int abi_version) { char value[8]; struct mlx4_device *dev; unsigned vendor, device; int i; if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", value, sizeof value) < 0) return NULL; vendor = strtol(value, NULL, 16); if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", value, sizeof value) < 0) return NULL; device = strtol(value, NULL, 16); for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) if (vendor == hca_table[i].vendor && device == hca_table[i].device) goto found; return NULL; found: if (abi_version < MLX4_UVERBS_MIN_ABI_VERSION || abi_version > MLX4_UVERBS_MAX_ABI_VERSION) { fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported " "(min supported %d, max supported %d)\n", abi_version, uverbs_sys_path, MLX4_UVERBS_MIN_ABI_VERSION, MLX4_UVERBS_MAX_ABI_VERSION); return NULL; } dev = calloc(1, sizeof *dev); if (!dev) { fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n", uverbs_sys_path); return NULL; } dev->page_size = sysconf(_SC_PAGESIZE); dev->abi_version = abi_version; dev->verbs_dev.ops = &mlx4_dev_ops; dev->verbs_dev.sz = sizeof(*dev); dev->verbs_dev.size_of_context = sizeof(struct mlx4_context) - sizeof(struct ibv_context); return &dev->verbs_dev; } static __attribute__((constructor)) void mlx4_register_driver(void) { verbs_register_driver("mlx4", mlx4_driver_init); } diff --git a/contrib/ofed/libmlx4/mlx4.h b/contrib/ofed/libmlx4/mlx4.h index 864ef9eccc60..5b3dd547b5fe 100644 --- a/contrib/ofed/libmlx4/mlx4.h +++ b/contrib/ofed/libmlx4/mlx4.h @@ -1,458 +1,459 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef MLX4_H #define MLX4_H #include #include #include #include #include #define MLX4_PORTS_NUM 2 #define PFX "mlx4: " enum { MLX4_STAT_RATE_OFFSET = 5 }; enum { MLX4_QP_TABLE_BITS = 8, MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS, MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 }; #define MLX4_REMOTE_SRQN_FLAGS(wr) htobe32(wr->qp_type.xrc.remote_srqn << 8) enum { MLX4_XSRQ_TABLE_BITS = 8, MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS, MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1 }; struct mlx4_xsrq_table { struct { struct mlx4_srq **table; int refcnt; } xsrq_table[MLX4_XSRQ_TABLE_SIZE]; pthread_mutex_t mutex; int num_xsrq; int shift; int mask; }; enum { MLX4_XRC_QPN_BIT = (1 << 23) }; enum mlx4_db_type { MLX4_DB_TYPE_CQ, MLX4_DB_TYPE_RQ, MLX4_NUM_DB_TYPE }; enum { MLX4_OPCODE_NOP = 0x00, MLX4_OPCODE_SEND_INVAL = 0x01, MLX4_OPCODE_RDMA_WRITE = 0x08, MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, MLX4_OPCODE_SEND = 0x0a, MLX4_OPCODE_SEND_IMM = 0x0b, MLX4_OPCODE_LSO = 0x0e, MLX4_OPCODE_RDMA_READ = 0x10, MLX4_OPCODE_ATOMIC_CS = 0x11, MLX4_OPCODE_ATOMIC_FA = 0x12, MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14, MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15, MLX4_OPCODE_BIND_MW = 0x18, MLX4_OPCODE_FMR = 0x19, MLX4_OPCODE_LOCAL_INVAL = 0x1b, MLX4_OPCODE_CONFIG_CMD = 0x1f, MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, MLX4_RECV_OPCODE_SEND = 0x01, MLX4_RECV_OPCODE_SEND_IMM = 0x02, MLX4_RECV_OPCODE_SEND_INVAL = 0x03, MLX4_CQE_OPCODE_ERROR = 0x1e, MLX4_CQE_OPCODE_RESIZE = 0x16, }; struct mlx4_device { struct verbs_device verbs_dev; int page_size; int abi_version; }; struct mlx4_db_page; struct mlx4_context { struct ibv_context ibv_ctx; void *uar; pthread_spinlock_t uar_lock; void *bf_page; int bf_buf_size; int bf_offset; pthread_spinlock_t bf_lock; struct { struct mlx4_qp **table; int refcnt; } qp_table[MLX4_QP_TABLE_SIZE]; pthread_mutex_t qp_table_mutex; int num_qps; int qp_table_shift; int qp_table_mask; int max_qp_wr; int max_sge; struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; pthread_mutex_t db_list_mutex; int cqe_size; struct mlx4_xsrq_table xsrq_table; struct { uint8_t valid; uint8_t link_layer; enum ibv_port_cap_flags caps; } port_query_cache[MLX4_PORTS_NUM]; struct { uint64_t offset; uint8_t offset_valid; } core_clock; void *hca_core_clock; }; struct mlx4_buf { void *buf; size_t length; }; struct mlx4_pd { struct ibv_pd ibv_pd; uint32_t pdn; }; enum 
{ MLX4_CQ_FLAGS_RX_CSUM_VALID = 1 << 0, MLX4_CQ_FLAGS_EXTENDED = 1 << 1, MLX4_CQ_FLAGS_SINGLE_THREADED = 1 << 2, }; struct mlx4_cq { struct ibv_cq_ex ibv_cq; struct mlx4_buf buf; struct mlx4_buf resize_buf; pthread_spinlock_t lock; uint32_t cqn; uint32_t cons_index; uint32_t *set_ci_db; uint32_t *arm_db; int arm_sn; int cqe_size; struct mlx4_qp *cur_qp; struct mlx4_cqe *cqe; uint32_t flags; }; struct mlx4_srq { struct verbs_srq verbs_srq; struct mlx4_buf buf; pthread_spinlock_t lock; uint64_t *wrid; uint32_t srqn; int max; int max_gs; int wqe_shift; int head; int tail; uint32_t *db; uint16_t counter; uint8_t ext_srq; }; struct mlx4_wq { uint64_t *wrid; pthread_spinlock_t lock; int wqe_cnt; int max_post; unsigned head; unsigned tail; int max_gs; int wqe_shift; int offset; }; struct mlx4_qp { struct verbs_qp verbs_qp; struct mlx4_buf buf; int max_inline_data; int buf_size; uint32_t doorbell_qpn; uint32_t sq_signal_bits; int sq_spare_wqes; struct mlx4_wq sq; uint32_t *db; struct mlx4_wq rq; uint8_t link_layer; uint32_t qp_cap_cache; }; struct mlx4_av { uint32_t port_pd; uint8_t reserved1; uint8_t g_slid; uint16_t dlid; uint8_t reserved2; uint8_t gid_index; uint8_t stat_rate; uint8_t hop_limit; uint32_t sl_tclass_flowlabel; uint8_t dgid[16]; }; struct mlx4_ah { struct ibv_ah ibv_ah; struct mlx4_av av; uint16_t vlan; uint8_t mac[6]; }; enum { MLX4_CSUM_SUPPORT_UD_OVER_IB = (1 << 0), MLX4_CSUM_SUPPORT_RAW_OVER_ETH = (1 << 1), /* Only report rx checksum when the validation is valid */ MLX4_RX_CSUM_VALID = (1 << 16), }; enum mlx4_cqe_status { MLX4_CQE_STATUS_TCP_UDP_CSUM_OK = (1 << 2), MLX4_CQE_STATUS_IPV4_PKT = (1 << 22), MLX4_CQE_STATUS_IP_HDR_CSUM_OK = (1 << 28), MLX4_CQE_STATUS_IPV4_CSUM_OK = MLX4_CQE_STATUS_IPV4_PKT | MLX4_CQE_STATUS_IP_HDR_CSUM_OK | MLX4_CQE_STATUS_TCP_UDP_CSUM_OK }; struct mlx4_cqe { uint32_t vlan_my_qpn; uint32_t immed_rss_invalid; uint32_t g_mlpath_rqpn; union { struct { uint16_t sl_vid; uint16_t rlid; }; uint32_t ts_47_16; }; uint32_t status; uint32_t byte_cnt; uint16_t wqe_index; uint16_t checksum; uint8_t reserved3; uint8_t ts_15_8; uint8_t ts_7_0; uint8_t owner_sr_opcode; }; static inline unsigned long align(unsigned long val, unsigned long align) { return (val + align - 1) & ~(align - 1); } int align_queue_size(int req); #define to_mxxx(xxx, type) \ ((struct mlx4_##type *) \ ((void *) ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx))) static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev) { /* ibv_device is first field of verbs_device * see try_driver() in libibverbs. 
*/ return container_of(ibdev, struct mlx4_device, verbs_dev); } static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) { return to_mxxx(ctx, context); } static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd) { return to_mxxx(pd, pd); } static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq) { return to_mxxx(cq, cq); } static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) { return container_of(container_of(ibsrq, struct verbs_srq, srq), struct mlx4_srq, verbs_srq); } static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) { return container_of(container_of(ibqp, struct verbs_qp, qp), struct mlx4_qp, verbs_qp); } static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) { return to_mxxx(ah, ah); } static inline void mlx4_update_cons_index(struct mlx4_cq *cq) { *cq->set_ci_db = htobe32(cq->cons_index & 0xffffff); } int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); void mlx4_free_buf(struct mlx4_buf *buf); uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db); int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr); int mlx4_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size); int mlx4_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr); int mlx4_query_rt_values(struct ibv_context *context, struct ibv_values_ex *values); struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); int mlx4_free_pd(struct ibv_pd *pd); struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, struct ibv_xrcd_init_attr *attr); int mlx4_close_xrcd(struct ibv_xrcd *xrcd); struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); int mlx4_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); int mlx4_dereg_mr(struct ibv_mr *mr); struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type); int mlx4_dealloc_mw(struct ibv_mw *mw); int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, struct ibv_mw_bind *mw_bind); struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector); struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr); void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr); int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, int entry_size); int mlx4_resize_cq(struct ibv_cq *cq, int cqe); int mlx4_destroy_cq(struct ibv_cq *cq); int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); int mlx4_arm_cq(struct ibv_cq *cq, int solicited); void mlx4_cq_event(struct ibv_cq *cq); void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); int mlx4_get_outstanding_cqes(struct mlx4_cq *cq); void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe); struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *attr); struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, struct ibv_srq_init_attr_ex *attr_ex); struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, struct ibv_srq_init_attr_ex *attr_ex); int mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int mask); int mlx4_query_srq(struct ibv_srq *srq, struct 
ibv_srq_attr *attr); int mlx4_destroy_srq(struct ibv_srq *srq); int mlx4_destroy_xrc_srq(struct ibv_srq *srq); int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, struct mlx4_srq *srq); -void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +void mlx4_cleanup_xsrq_table(struct mlx4_xsrq_table *xsrq_table); +int mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, struct mlx4_srq *srq); void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); int mlx4_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr); struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr); int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int mlx4_destroy_qp(struct ibv_qp *qp); void mlx4_init_qp_indices(struct mlx4_qp *qp); void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp); int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr); int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, struct mlx4_qp *qp); int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap, enum ibv_qp_type type, struct mlx4_qp *qp); void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, enum ibv_qp_type type); struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn); int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp); void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn); struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); int mlx4_destroy_ah(struct ibv_ah *ah); int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr, struct mlx4_ah *ah); void mlx4_free_av(struct mlx4_ah *ah); #endif /* MLX4_H */ diff --git a/contrib/ofed/libmlx4/srq.c b/contrib/ofed/libmlx4/srq.c index b8d25bb343da..da709c630450 100644 --- a/contrib/ofed/libmlx4/srq.c +++ b/contrib/ofed/libmlx4/srq.c @@ -1,325 +1,334 @@ /* * Copyright (c) 2007 Cisco, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include "mlx4.h" #include "doorbell.h" #include "wqe.h" #include "mlx4-abi.h" static void *get_wqe(struct mlx4_srq *srq, int n) { return srq->buf.buf + (n << srq->wqe_shift); } void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind) { struct mlx4_wqe_srq_next_seg *next; pthread_spin_lock(&srq->lock); next = get_wqe(srq, srq->tail); next->next_wqe_index = htobe16(ind); srq->tail = ind; pthread_spin_unlock(&srq->lock); } int mlx4_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { struct mlx4_srq *srq = to_msrq(ibsrq); struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scat; int err = 0; int nreq; int i; pthread_spin_lock(&srq->lock); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (wr->num_sge > srq->max_gs) { err = -1; *bad_wr = wr; break; } if (srq->head == srq->tail) { /* SRQ is full*/ err = -1; *bad_wr = wr; break; } srq->wrid[srq->head] = wr->wr_id; next = get_wqe(srq, srq->head); srq->head = be16toh(next->next_wqe_index); scat = (struct mlx4_wqe_data_seg *) (next + 1); for (i = 0; i < wr->num_sge; ++i) { scat[i].byte_count = htobe32(wr->sg_list[i].length); scat[i].lkey = htobe32(wr->sg_list[i].lkey); scat[i].addr = htobe64(wr->sg_list[i].addr); } if (i < srq->max_gs) { scat[i].byte_count = 0; scat[i].lkey = htobe32(MLX4_INVALID_LKEY); scat[i].addr = 0; } } if (nreq) { srq->counter += nreq; /* * Make sure that descriptors are written before * we write doorbell record. */ udma_to_device_barrier(); *srq->db = htobe32(srq->counter); } pthread_spin_unlock(&srq->lock); return err; } int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, struct mlx4_srq *srq) { struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scatter; int size; int buf_size; int i; srq->wrid = malloc(srq->max * sizeof (uint64_t)); if (!srq->wrid) return -1; size = sizeof (struct mlx4_wqe_srq_next_seg) + srq->max_gs * sizeof (struct mlx4_wqe_data_seg); for (srq->wqe_shift = 5; 1 << srq->wqe_shift < size; ++srq->wqe_shift) ; /* nothing */ buf_size = srq->max << srq->wqe_shift; if (mlx4_alloc_buf(&srq->buf, buf_size, to_mdev(pd->context->device)->page_size)) { free(srq->wrid); return -1; } memset(srq->buf.buf, 0, buf_size); /* * Now initialize the SRQ buffer so that all of the WQEs are * linked into the list of free WQEs. 
*/ for (i = 0; i < srq->max; ++i) { next = get_wqe(srq, i); next->next_wqe_index = htobe16((i + 1) & (srq->max - 1)); for (scatter = (void *) (next + 1); (void *) scatter < (void *) next + (1 << srq->wqe_shift); ++scatter) scatter->lkey = htobe32(MLX4_INVALID_LKEY); } srq->head = 0; srq->tail = srq->max - 1; return 0; } -void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size) +void mlx4_cleanup_xsrq_table(struct mlx4_xsrq_table *xsrq_table) { + pthread_mutex_destroy(&xsrq_table->mutex); +} + +int mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size) +{ + int ret; memset(xsrq_table, 0, sizeof *xsrq_table); xsrq_table->num_xsrq = size; xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS; xsrq_table->mask = (1 << xsrq_table->shift) - 1; - pthread_mutex_init(&xsrq_table->mutex, NULL); + return pthread_mutex_init(&xsrq_table->mutex, NULL); } struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) { int index; index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; if (xsrq_table->xsrq_table[index].refcnt) return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask]; return NULL; } int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, struct mlx4_srq *srq) { int index, ret = 0; index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; pthread_mutex_lock(&xsrq_table->mutex); if (!xsrq_table->xsrq_table[index].refcnt) { xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1, sizeof(struct mlx4_srq *)); if (!xsrq_table->xsrq_table[index].table) { ret = -1; goto out; } } xsrq_table->xsrq_table[index].refcnt++; xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq; out: pthread_mutex_unlock(&xsrq_table->mutex); return ret; } void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) { int index; index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; pthread_mutex_lock(&xsrq_table->mutex); if (--xsrq_table->xsrq_table[index].refcnt) xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL; else free(xsrq_table->xsrq_table[index].table); pthread_mutex_unlock(&xsrq_table->mutex); } struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, struct ibv_srq_init_attr_ex *attr_ex) { struct mlx4_create_xsrq cmd; struct mlx4_create_srq_resp resp; struct mlx4_srq *srq; int ret; /* Sanity check SRQ size before proceeding */ if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64) return NULL; srq = calloc(1, sizeof *srq); if (!srq) return NULL; if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) goto err; srq->max = align_queue_size(attr_ex->attr.max_wr + 1); srq->max_gs = attr_ex->attr.max_sge; srq->counter = 0; srq->ext_srq = 1; if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq)) - goto err; + goto err_spl; srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); if (!srq->db) goto err_free; *srq->db = 0; cmd.buf_addr = (uintptr_t) srq->buf.buf; cmd.db_addr = (uintptr_t) srq->db; ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, sizeof(srq->verbs_srq), attr_ex, &cmd.ibv_cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); if (ret) goto err_db; ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table, srq->verbs_srq.srq_num, srq); if (ret) goto err_destroy; return &srq->verbs_srq.srq; err_destroy: ibv_cmd_destroy_srq(&srq->verbs_srq.srq); err_db: mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db); err_free: free(srq->wrid); mlx4_free_buf(&srq->buf); +err_spl: + pthread_spin_destroy(&srq->lock); err: free(srq); return 
NULL; } int mlx4_destroy_xrc_srq(struct ibv_srq *srq) { struct mlx4_context *mctx = to_mctx(srq->context); struct mlx4_srq *msrq = to_msrq(srq); struct mlx4_cq *mcq; int ret; mcq = to_mcq(msrq->verbs_srq.cq); mlx4_cq_clean(mcq, 0, msrq); pthread_spin_lock(&mcq->lock); mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num); pthread_spin_unlock(&mcq->lock); ret = ibv_cmd_destroy_srq(srq); if (ret) { pthread_spin_lock(&mcq->lock); mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq); pthread_spin_unlock(&mcq->lock); return ret; } mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db); mlx4_free_buf(&msrq->buf); free(msrq->wrid); + pthread_spin_destroy(&msrq->lock); free(msrq); return 0; } diff --git a/contrib/ofed/libmlx4/verbs.c b/contrib/ofed/libmlx4/verbs.c index f6f43f9bef76..f9c7f5f67b99 100644 --- a/contrib/ofed/libmlx4/verbs.c +++ b/contrib/ofed/libmlx4/verbs.c @@ -1,1255 +1,1271 @@ /* * Copyright (c) 2007 Cisco, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include "mlx4.h" #include "mlx4-abi.h" #include "wqe.h" int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr) { struct ibv_query_device cmd; uint64_t raw_fw_ver; unsigned major, minor, sub_minor; int ret; ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); if (ret) return ret; major = (raw_fw_ver >> 32) & 0xffff; minor = (raw_fw_ver >> 16) & 0xffff; sub_minor = raw_fw_ver & 0xffff; snprintf(attr->fw_ver, sizeof attr->fw_ver, "%d.%d.%03d", major, minor, sub_minor); return 0; } int mlx4_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size) { struct mlx4_context *mctx = to_mctx(context); struct mlx4_query_device_ex_resp resp = {}; struct mlx4_query_device_ex cmd = {}; uint64_t raw_fw_ver; unsigned sub_minor; unsigned major; unsigned minor; int err; err = ibv_cmd_query_device_ex(context, input, attr, attr_size, &raw_fw_ver, &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp)); if (err) return err; if (resp.comp_mask & MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET) { mctx->core_clock.offset = resp.hca_core_clock_offset; mctx->core_clock.offset_valid = 1; } major = (raw_fw_ver >> 32) & 0xffff; minor = (raw_fw_ver >> 16) & 0xffff; sub_minor = raw_fw_ver & 0xffff; snprintf(attr->orig_attr.fw_ver, sizeof attr->orig_attr.fw_ver, "%d.%d.%03d", major, minor, sub_minor); return 0; } #define READL(ptr) (*((uint32_t *)(ptr))) static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles) { unsigned int clockhi, clocklo, clockhi1; int i; struct mlx4_context *ctx = to_mctx(context); if (!ctx->hca_core_clock) return -EOPNOTSUPP; /* Handle wraparound */ for (i = 0; i < 2; i++) { clockhi = be32toh(READL(ctx->hca_core_clock)); clocklo = be32toh(READL(ctx->hca_core_clock + 4)); clockhi1 = be32toh(READL(ctx->hca_core_clock)); if (clockhi == clockhi1) break; } *cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo; return 0; } int mlx4_query_rt_values(struct ibv_context *context, struct ibv_values_ex *values) { uint32_t comp_mask = 0; int err = 0; if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) { uint64_t cycles; err = mlx4_read_clock(context, &cycles); if (!err) { values->raw_clock.tv_sec = 0; values->raw_clock.tv_nsec = cycles; comp_mask |= IBV_VALUES_MASK_RAW_CLOCK; } } values->comp_mask = comp_mask; return err; } int mlx4_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr) { struct ibv_query_port cmd; int err; err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); if (!err && port <= MLX4_PORTS_NUM && port > 0) { struct mlx4_context *mctx = to_mctx(context); if (!mctx->port_query_cache[port - 1].valid) { mctx->port_query_cache[port - 1].link_layer = attr->link_layer; mctx->port_query_cache[port - 1].caps = attr->port_cap_flags; mctx->port_query_cache[port - 1].valid = 1; } } return err; } /* Only the fields in the port cache will be valid */ static int query_port_cache(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { struct mlx4_context *mctx = to_mctx(context); if (port_num <= 0 || port_num > MLX4_PORTS_NUM) return -EINVAL; if (mctx->port_query_cache[port_num - 1].valid) { port_attr->link_layer = mctx-> port_query_cache[port_num - 1]. link_layer; port_attr->port_cap_flags = mctx-> port_query_cache[port_num - 1]. 
caps; return 0; } return mlx4_query_port(context, port_num, (struct ibv_port_attr *)port_attr); } struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context) { struct ibv_alloc_pd cmd; struct mlx4_alloc_pd_resp resp; struct mlx4_pd *pd; pd = malloc(sizeof *pd); if (!pd) return NULL; if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp)) { free(pd); return NULL; } pd->pdn = resp.pdn; return &pd->ibv_pd; } int mlx4_free_pd(struct ibv_pd *pd) { int ret; ret = ibv_cmd_dealloc_pd(pd); if (ret) return ret; free(to_mpd(pd)); return 0; } struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, struct ibv_xrcd_init_attr *attr) { struct ibv_open_xrcd cmd; struct ibv_open_xrcd_resp resp; struct verbs_xrcd *xrcd; int ret; xrcd = calloc(1, sizeof *xrcd); if (!xrcd) return NULL; ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr, &cmd, sizeof cmd, &resp, sizeof resp); if (ret) goto err; return &xrcd->xrcd; err: free(xrcd); return NULL; } int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd) { struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd); int ret; ret = ibv_cmd_close_xrcd(xrcd); if (!ret) free(xrcd); return ret; } struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { struct ibv_mr *mr; struct ibv_reg_mr cmd; struct ibv_reg_mr_resp resp; int ret; mr = malloc(sizeof *mr); if (!mr) return NULL; ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, access, mr, &cmd, sizeof cmd, &resp, sizeof resp); if (ret) { free(mr); return NULL; } return mr; } int mlx4_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access) { struct ibv_rereg_mr cmd; struct ibv_rereg_mr_resp resp; if (flags & IBV_REREG_MR_KEEP_VALID) return ENOTSUP; return ibv_cmd_rereg_mr(mr, flags, addr, length, (uintptr_t)addr, access, pd, &cmd, sizeof(cmd), &resp, sizeof(resp)); } int mlx4_dereg_mr(struct ibv_mr *mr) { int ret; ret = ibv_cmd_dereg_mr(mr); if (ret) return ret; free(mr); return 0; } struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) { struct ibv_mw *mw; struct ibv_alloc_mw cmd; struct ibv_alloc_mw_resp resp; int ret; mw = calloc(1, sizeof(*mw)); if (!mw) return NULL; ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp, sizeof(resp)); if (ret) { free(mw); return NULL; } return mw; } int mlx4_dealloc_mw(struct ibv_mw *mw) { int ret; struct ibv_dealloc_mw cmd; ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd)); if (ret) return ret; free(mw); return 0; } int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, struct ibv_mw_bind *mw_bind) { struct ibv_send_wr *bad_wr = NULL; struct ibv_send_wr wr = { }; int ret; wr.opcode = IBV_WR_BIND_MW; wr.next = NULL; wr.wr_id = mw_bind->wr_id; wr.send_flags = mw_bind->send_flags; wr.bind_mw.mw = mw; wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey); wr.bind_mw.bind_info = mw_bind->bind_info; ret = mlx4_post_send(qp, &wr, &bad_wr); if (ret) return ret; /* updating the mw with the latest rkey. 
*/ mw->rkey = wr.bind_mw.rkey; return 0; } int align_queue_size(int req) { int nent; for (nent = 1; nent < req; nent <<= 1) ; /* nothing */ return nent; } enum { CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS | IBV_WC_EX_WITH_COMPLETION_TIMESTAMP }; enum { CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS }; enum { CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED }; static int mlx4_cmd_create_cq(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, struct mlx4_cq *cq) { struct mlx4_create_cq cmd = {}; struct mlx4_create_cq_resp resp = {}; int ret; cmd.buf_addr = (uintptr_t) cq->buf.buf; cmd.db_addr = (uintptr_t) cq->set_ci_db; ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel, cq_attr->comp_vector, ibv_cq_ex_to_cq(&cq->ibv_cq), &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (!ret) cq->cqn = resp.cqn; return ret; } static int mlx4_cmd_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, struct mlx4_cq *cq) { struct mlx4_create_cq_ex cmd = {}; struct mlx4_create_cq_resp_ex resp = {}; int ret; cmd.buf_addr = (uintptr_t) cq->buf.buf; cmd.db_addr = (uintptr_t) cq->set_ci_db; ret = ibv_cmd_create_cq_ex(context, cq_attr, &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp)); if (!ret) cq->cqn = resp.cqn; return ret; } static struct ibv_cq_ex *create_cq(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, int cq_alloc_flags) { struct mlx4_cq *cq; int ret; struct mlx4_context *mctx = to_mctx(context); /* Sanity check CQ size before proceeding */ if (cq_attr->cqe > 0x3fffff) { errno = EINVAL; return NULL; } if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) { errno = ENOTSUP; return NULL; } if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) { errno = ENOTSUP; return NULL; } if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS) return NULL; /* mlx4 devices don't support slid and sl in cqe when completion * timestamp is enabled in the CQ */ if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) && (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) { errno = ENOTSUP; return NULL; } cq = malloc(sizeof *cq); if (!cq) return NULL; cq->cons_index = 0; if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) goto err; cq_attr->cqe = align_queue_size(cq_attr->cqe + 1); if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cq_attr->cqe, mctx->cqe_size)) - goto err; + goto err_spl; cq->cqe_size = mctx->cqe_size; cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ); if (!cq->set_ci_db) goto err_buf; cq->arm_db = cq->set_ci_db + 1; *cq->arm_db = 0; cq->arm_sn = 1; *cq->set_ci_db = 0; cq->flags = cq_alloc_flags; if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED) cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED; --cq_attr->cqe; if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED) ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq); else ret = mlx4_cmd_create_cq(context, cq_attr, cq); if (ret) goto err_db; if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED) mlx4_cq_fill_pfns(cq, cq_attr); return &cq->ibv_cq; err_db: mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db); err_buf: mlx4_free_buf(&cq->buf); +err_spl: + pthread_spin_destroy(&cq->lock); + err: free(cq); return NULL; } struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector) { struct 
ibv_cq_ex *cq; struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel, .comp_vector = comp_vector, .wc_flags = IBV_WC_STANDARD_FLAGS}; cq = create_cq(context, &cq_attr, 0); return cq ? ibv_cq_ex_to_cq(cq) : NULL; } struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr) { /* * Make local copy since some attributes might be adjusted * for internal use. */ struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe, .channel = cq_attr->channel, .comp_vector = cq_attr->comp_vector, .wc_flags = cq_attr->wc_flags, .comp_mask = cq_attr->comp_mask, .flags = cq_attr->flags}; return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED); } int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe) { struct mlx4_cq *cq = to_mcq(ibcq); struct mlx4_resize_cq cmd; struct ibv_resize_cq_resp resp; struct mlx4_buf buf; int old_cqe, outst_cqe, ret; /* Sanity check CQ size before proceeding */ if (cqe > 0x3fffff) return EINVAL; pthread_spin_lock(&cq->lock); cqe = align_queue_size(cqe + 1); if (cqe == ibcq->cqe + 1) { ret = 0; goto out; } /* Can't be smaller then the number of outstanding CQEs */ outst_cqe = mlx4_get_outstanding_cqes(cq); if (cqe < outst_cqe + 1) { ret = EINVAL; goto out; } ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size); if (ret) goto out; old_cqe = ibcq->cqe; cmd.buf_addr = (uintptr_t) buf.buf; ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd, &resp, sizeof resp); if (ret) { mlx4_free_buf(&buf); goto out; } mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe); mlx4_free_buf(&cq->buf); cq->buf = buf; mlx4_update_cons_index(cq); out: pthread_spin_unlock(&cq->lock); return ret; } int mlx4_destroy_cq(struct ibv_cq *cq) { int ret; ret = ibv_cmd_destroy_cq(cq); if (ret) return ret; + verbs_cleanup_cq(cq); + pthread_spin_destroy(&to_mcq(cq)->lock); mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db); mlx4_free_buf(&to_mcq(cq)->buf); free(to_mcq(cq)); return 0; } struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *attr) { struct mlx4_create_srq cmd; struct mlx4_create_srq_resp resp; struct mlx4_srq *srq; int ret; /* Sanity check SRQ size before proceeding */ if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) return NULL; srq = malloc(sizeof *srq); if (!srq) return NULL; if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) goto err; srq->max = align_queue_size(attr->attr.max_wr + 1); srq->max_gs = attr->attr.max_sge; srq->counter = 0; srq->ext_srq = 0; if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) - goto err; + goto err_spl; srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); if (!srq->db) goto err_free; *srq->db = 0; cmd.buf_addr = (uintptr_t) srq->buf.buf; cmd.db_addr = (uintptr_t) srq->db; ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr, &cmd.ibv_cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); if (ret) goto err_db; return &srq->verbs_srq.srq; err_db: mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); err_free: free(srq->wrid); mlx4_free_buf(&srq->buf); +err_spl: + pthread_spin_destroy(&srq->lock); + err: free(srq); return NULL; } struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, struct ibv_srq_init_attr_ex *attr_ex) { if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) || (attr_ex->srq_type == IBV_SRQT_BASIC)) return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex); else if (attr_ex->srq_type == IBV_SRQT_XRC) return mlx4_create_xrc_srq(context, attr_ex); return NULL; } int 
mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int attr_mask) { struct ibv_modify_srq cmd; return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); } int mlx4_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) { struct ibv_query_srq cmd; return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); } int mlx4_destroy_srq(struct ibv_srq *srq) { int ret; if (to_msrq(srq)->ext_srq) return mlx4_destroy_xrc_srq(srq); ret = ibv_cmd_destroy_srq(srq); if (ret) return ret; mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db); mlx4_free_buf(&to_msrq(srq)->buf); free(to_msrq(srq)->wrid); + pthread_spin_destroy(&to_msrq(srq)->lock); free(to_msrq(srq)); return 0; } static int mlx4_cmd_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx4_create_qp *cmd, struct mlx4_qp *qp) { struct mlx4_create_qp_ex cmd_ex; struct mlx4_create_qp_resp_ex resp; int ret; memset(&cmd_ex, 0, sizeof(cmd_ex)); memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle, offsetof(typeof(cmd->ibv_cmd), is_srq) + sizeof(cmd->ibv_cmd.is_srq) - offsetof(typeof(cmd->ibv_cmd), user_handle)); memcpy(&cmd_ex.drv_ex, &cmd->buf_addr, offsetof(typeof(*cmd), sq_no_prefetch) + sizeof(cmd->sq_no_prefetch) - sizeof(cmd->ibv_cmd)); ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd), sizeof(cmd_ex), &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp)); return ret; } enum { MLX4_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_XRCD | IBV_QP_INIT_ATTR_CREATE_FLAGS), }; enum { MLX4_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS), }; struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr) { struct mlx4_context *ctx = to_mctx(context); struct mlx4_create_qp cmd; struct ibv_create_qp_resp resp; struct mlx4_qp *qp; int ret; /* Sanity check QP size before proceeding */ if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */ if (attr->cap.max_send_wr > ctx->max_qp_wr || attr->cap.max_recv_wr > ctx->max_qp_wr || attr->cap.max_send_sge > ctx->max_sge || attr->cap.max_recv_sge > ctx->max_sge) return NULL; } else { if (attr->cap.max_send_wr > 65536 || attr->cap.max_recv_wr > 65536 || attr->cap.max_send_sge > 64 || attr->cap.max_recv_sge > 64) return NULL; } if (attr->cap.max_inline_data > 1024) return NULL; if (attr->comp_mask & ~MLX4_CREATE_QP_SUP_COMP_MASK) return NULL; qp = calloc(1, sizeof *qp); if (!qp) return NULL; if (attr->qp_type == IBV_QPT_XRC_RECV) { attr->cap.max_send_wr = qp->sq.wqe_cnt = 0; } else { mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); /* * We need to leave 2 KB + 1 WQE of headroom in the SQ to * allow HW to prefetch. 
*/ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); } if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND || attr->qp_type == IBV_QPT_XRC_RECV) { attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0; } else { qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); if (attr->cap.max_recv_sge < 1) attr->cap.max_recv_sge = 1; if (attr->cap.max_recv_wr < 1) attr->cap.max_recv_wr = 1; } if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp)) goto err; mlx4_init_qp_indices(qp); - if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || - pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE)) goto err_free; + if (pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_sq_spl; if (attr->cap.max_recv_sge) { qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); if (!qp->db) - goto err_free; + goto err_rq_spl; *qp->db = 0; cmd.db_addr = (uintptr_t) qp->db; } else { cmd.db_addr = 0; } cmd.buf_addr = (uintptr_t) qp->buf.buf; cmd.log_sq_stride = qp->sq.wqe_shift; for (cmd.log_sq_bb_count = 0; qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; ++cmd.log_sq_bb_count) ; /* nothing */ cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ memset(cmd.reserved, 0, sizeof cmd.reserved); pthread_mutex_lock(&to_mctx(context)->qp_table_mutex); if (attr->comp_mask & MLX4_CREATE_QP_EX2_COMP_MASK) ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp); else ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, &cmd.ibv_cmd, sizeof(cmd), &resp, sizeof(resp)); if (ret) goto err_rq_db; if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp); if (ret) goto err_destroy; } pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr; qp->rq.max_gs = attr->cap.max_recv_sge; if (attr->qp_type != IBV_QPT_XRC_RECV) mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); qp->doorbell_qpn = htobe32(qp->verbs_qp.qp.qp_num << 8); if (attr->sq_sig_all) qp->sq_signal_bits = htobe32(MLX4_WQE_CTRL_CQ_UPDATE); else qp->sq_signal_bits = 0; return &qp->verbs_qp.qp; err_destroy: ibv_cmd_destroy_qp(&qp->verbs_qp.qp); err_rq_db: pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); if (attr->cap.max_recv_sge) mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db); - +err_rq_spl: + pthread_spin_destroy(&qp->rq.lock); +err_sq_spl: + pthread_spin_destroy(&qp->sq.lock); err_free: free(qp->sq.wrid); if (qp->rq.wqe_cnt) free(qp->rq.wrid); mlx4_free_buf(&qp->buf); err: free(qp); return NULL; } struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) { struct ibv_qp_init_attr_ex attr_ex; struct ibv_qp *qp; memcpy(&attr_ex, attr, sizeof *attr); attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; attr_ex.pd = pd; qp = mlx4_create_qp_ex(pd->context, &attr_ex); if (qp) memcpy(attr, &attr_ex, sizeof *attr); return qp; } struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr) { struct ibv_open_qp cmd; struct ibv_create_qp_resp resp; struct mlx4_qp *qp; int ret; qp = calloc(1, sizeof *qp); if (!qp) return NULL; ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, &cmd, sizeof cmd, &resp, sizeof resp); if (ret) goto err; return &qp->verbs_qp.qp; err: free(qp); return NULL; } int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask, struct 
ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd; struct mlx4_qp *qp = to_mqp(ibqp); int ret; ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd); if (ret) return ret; init_attr->cap.max_send_wr = qp->sq.max_post; init_attr->cap.max_send_sge = qp->sq.max_gs; init_attr->cap.max_inline_data = qp->max_inline_data; attr->cap = init_attr->cap; return 0; } int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { struct ibv_modify_qp cmd = {}; struct ibv_port_attr port_attr; struct mlx4_qp *mqp = to_mqp(qp); struct ibv_device_attr device_attr; int ret; memset(&device_attr, 0, sizeof(device_attr)); if (attr_mask & IBV_QP_PORT) { ret = ibv_query_port(qp->context, attr->port_num, &port_attr); if (ret) return ret; mqp->link_layer = port_attr.link_layer; ret = ibv_query_device(qp->context, &device_attr); if (ret) return ret; switch(qp->qp_type) { case IBV_QPT_UD: if ((mqp->link_layer == IBV_LINK_LAYER_INFINIBAND) && (device_attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM)) mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_UD_OVER_IB | MLX4_RX_CSUM_VALID; break; case IBV_QPT_RAW_PACKET: if ((mqp->link_layer == IBV_LINK_LAYER_ETHERNET) && (device_attr.device_cap_flags & IBV_DEVICE_RAW_IP_CSUM)) mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_RAW_OVER_ETH | MLX4_RX_CSUM_VALID; break; default: break; } } if (qp->state == IBV_QPS_RESET && attr_mask & IBV_QP_STATE && attr->qp_state == IBV_QPS_INIT) { mlx4_qp_init_sq_ownership(to_mqp(qp)); } ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd); if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) { if (qp->recv_cq) mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, qp->srq ? to_msrq(qp->srq) : NULL); if (qp->send_cq && qp->send_cq != qp->recv_cq) mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); mlx4_init_qp_indices(to_mqp(qp)); if (to_mqp(qp)->rq.wqe_cnt) *to_mqp(qp)->db = 0; } return ret; } static void mlx4_lock_cqs(struct ibv_qp *qp) { struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); if (!qp->send_cq || !qp->recv_cq) { if (qp->send_cq) pthread_spin_lock(&send_cq->lock); else if (qp->recv_cq) pthread_spin_lock(&recv_cq->lock); } else if (send_cq == recv_cq) { pthread_spin_lock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { pthread_spin_lock(&send_cq->lock); pthread_spin_lock(&recv_cq->lock); } else { pthread_spin_lock(&recv_cq->lock); pthread_spin_lock(&send_cq->lock); } } static void mlx4_unlock_cqs(struct ibv_qp *qp) { struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); if (!qp->send_cq || !qp->recv_cq) { if (qp->send_cq) pthread_spin_unlock(&send_cq->lock); else if (qp->recv_cq) pthread_spin_unlock(&recv_cq->lock); } else if (send_cq == recv_cq) { pthread_spin_unlock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { pthread_spin_unlock(&recv_cq->lock); pthread_spin_unlock(&send_cq->lock); } else { pthread_spin_unlock(&send_cq->lock); pthread_spin_unlock(&recv_cq->lock); } } int mlx4_destroy_qp(struct ibv_qp *ibqp) { struct mlx4_qp *qp = to_mqp(ibqp); int ret; pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex); ret = ibv_cmd_destroy_qp(ibqp); if (ret) { pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); return ret; } mlx4_lock_cqs(ibqp); if (ibqp->recv_cq) __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq) __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL); if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); mlx4_unlock_cqs(ibqp); pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); + pthread_spin_destroy(&qp->rq.lock); + pthread_spin_destroy(&qp->sq.lock); + if (qp->rq.wqe_cnt) { mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); free(qp->rq.wrid); } if (qp->sq.wqe_cnt) free(qp->sq.wrid); mlx4_free_buf(&qp->buf); free(qp); return 0; } static int link_local_gid(const union ibv_gid *gid) { uint32_t *tmp = (uint32_t *)gid->raw; uint32_t hi = tmp[0]; uint32_t lo = tmp[1]; if (hi == htobe32(0xfe800000) && lo == 0) return 1; return 0; } static int is_multicast_gid(const union ibv_gid *gid) { return gid->raw[0] == 0xff; } static uint16_t get_vlan_id(union ibv_gid *gid) { uint16_t vid; vid = gid->raw[11] << 8 | gid->raw[12]; return vid < 0x1000 ? vid : 0xffff; } static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah, struct ibv_ah_attr *attr) { int err, i; uint16_t vid; union ibv_gid sgid; if (link_local_gid(&attr->grh.dgid)) { memcpy(ah->mac, &attr->grh.dgid.raw[8], 3); memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3); ah->mac[0] ^= 2; vid = get_vlan_id(&attr->grh.dgid); } else if (is_multicast_gid(&attr->grh.dgid)) { ah->mac[0] = 0x33; ah->mac[1] = 0x33; for (i = 2; i < 6; ++i) ah->mac[i] = attr->grh.dgid.raw[i + 10]; err = ibv_query_gid(pd->context, attr->port_num, attr->grh.sgid_index, &sgid); if (err) return err; ah->av.dlid = htobe16(0xc000); ah->av.port_pd |= htobe32(1 << 31); vid = get_vlan_id(&sgid); } else return 1; if (vid != 0xffff) { ah->av.port_pd |= htobe32(1 << 29); ah->vlan = vid | ((attr->sl & 7) << 13); } return 0; } struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { struct mlx4_ah *ah; struct ibv_port_attr port_attr; if (query_port_cache(pd->context, attr->port_num, &port_attr)) return NULL; ah = malloc(sizeof *ah); if (!ah) return NULL; memset(&ah->av, 0, sizeof ah->av); ah->av.port_pd = htobe32(to_mpd(pd)->pdn | (attr->port_num << 24)); if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { ah->av.g_slid = attr->src_path_bits; ah->av.dlid = htobe16(attr->dlid); ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 28); } else ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 29); if (attr->static_rate) { ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET; /* XXX check rate cap? */ } if (attr->is_global) { ah->av.g_slid |= 0x80; ah->av.gid_index = attr->grh.sgid_index; ah->av.hop_limit = attr->grh.hop_limit; ah->av.sl_tclass_flowlabel |= htobe32((attr->grh.traffic_class << 20) | attr->grh.flow_label); memcpy(ah->av.dgid, attr->grh.dgid.raw, 16); } if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) { uint16_t vid; if (ibv_resolve_eth_l2_from_gid(pd->context, attr, ah->mac, &vid)) { free(ah); return NULL; } if (vid <= 0xfff) { ah->av.port_pd |= htobe32(1 << 29); ah->vlan = vid | ((attr->sl & 7) << 13); } } else { if (mlx4_resolve_grh_to_l2(pd, ah, attr)) { free(ah); return NULL; } } } return &ah->ibv_ah; } int mlx4_destroy_ah(struct ibv_ah *ah) { free(to_mah(ah)); return 0; } diff --git a/contrib/ofed/libmlx5/mlx5.c b/contrib/ofed/libmlx5/mlx5.c index bc35ebe05cd9..f55be7dae45e 100644 --- a/contrib/ofed/libmlx5/mlx5.c +++ b/contrib/ofed/libmlx5/mlx5.c @@ -1,1039 +1,1091 @@ /* * Copyright (c) 2012 Mellanox Technologies, Inc. 
All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include "mlx5.h" #include "mlx5-abi.h" #ifndef PCI_VENDOR_ID_MELLANOX #define PCI_VENDOR_ID_MELLANOX 0x15b3 #endif #ifndef CPU_OR #define CPU_OR(x, y, z) do {} while (0) #endif #ifndef CPU_EQUAL #define CPU_EQUAL(x, y) 1 #endif #define HCA(v, d) \ { .vendor = PCI_VENDOR_ID_##v, \ .device = d } static struct { unsigned vendor; unsigned device; } hca_table[] = { HCA(MELLANOX, 4113), /* MT4113 Connect-IB */ HCA(MELLANOX, 4114), /* Connect-IB Virtual Function */ HCA(MELLANOX, 4115), /* ConnectX-4 */ HCA(MELLANOX, 4116), /* ConnectX-4 Virtual Function */ HCA(MELLANOX, 4117), /* ConnectX-4LX */ HCA(MELLANOX, 4118), /* ConnectX-4LX Virtual Function */ HCA(MELLANOX, 4119), /* ConnectX-5, PCIe 3.0 */ HCA(MELLANOX, 4120), /* ConnectX-5 Virtual Function */ HCA(MELLANOX, 4121), /* ConnectX-5 Ex */ HCA(MELLANOX, 4122), /* ConnectX-5 Ex VF */ HCA(MELLANOX, 4123), /* ConnectX-6 */ HCA(MELLANOX, 4124), /* ConnectX-6 VF */ HCA(MELLANOX, 4125), /* ConnectX-6 DX */ HCA(MELLANOX, 4126), /* ConnectX family mlx5Gen Virtual Function */ HCA(MELLANOX, 4127), /* ConnectX-6 LX */ HCA(MELLANOX, 4129), /* ConnectX-7 */ HCA(MELLANOX, 4131), /* ConnectX-8 */ HCA(MELLANOX, 41682), /* BlueField integrated ConnectX-5 network controller */ HCA(MELLANOX, 41683), /* BlueField integrated ConnectX-5 network controller VF */ HCA(MELLANOX, 41686), /* BlueField-2 integrated ConnectX-6 Dx network controller */ HCA(MELLANOX, 41692), /* BlueField-3 integrated ConnectX-7 network controller */ HCA(MELLANOX, 41695), /* BlueField-4 integrated ConnectX-8 network controller */ }; uint32_t mlx5_debug_mask = 0; int mlx5_freeze_on_error_cqe; static struct ibv_context_ops mlx5_ctx_ops = { .query_device = mlx5_query_device, .query_port = mlx5_query_port, .alloc_pd = mlx5_alloc_pd, .dealloc_pd = mlx5_free_pd, .reg_mr = mlx5_reg_mr, .rereg_mr = mlx5_rereg_mr, .dereg_mr = mlx5_dereg_mr, .alloc_mw = mlx5_alloc_mw, .dealloc_mw = mlx5_dealloc_mw, .bind_mw = mlx5_bind_mw, .create_cq = mlx5_create_cq, .poll_cq = mlx5_poll_cq, .req_notify_cq = mlx5_arm_cq, .cq_event = mlx5_cq_event, .resize_cq = mlx5_resize_cq, .destroy_cq = mlx5_destroy_cq, .create_srq = 
mlx5_create_srq, .modify_srq = mlx5_modify_srq, .query_srq = mlx5_query_srq, .destroy_srq = mlx5_destroy_srq, .post_srq_recv = mlx5_post_srq_recv, .create_qp = mlx5_create_qp, .query_qp = mlx5_query_qp, .modify_qp = mlx5_modify_qp, .destroy_qp = mlx5_destroy_qp, .post_send = mlx5_post_send, .post_recv = mlx5_post_recv, .create_ah = mlx5_create_ah, .destroy_ah = mlx5_destroy_ah, .attach_mcast = mlx5_attach_mcast, .detach_mcast = mlx5_detach_mcast }; static int read_number_from_line(const char *line, int *value) { const char *ptr; ptr = strchr(line, ':'); if (!ptr) return 1; ++ptr; *value = atoi(ptr); return 0; } /** * The function looks for the first free user-index in all the * user-index tables. If all are used, returns -1, otherwise * a valid user-index. * In case the reference count of the table is zero, it means the * table is not in use and wasn't allocated yet, therefore the * mlx5_store_uidx allocates the table, and increment the reference * count on the table. */ static int32_t get_free_uidx(struct mlx5_context *ctx) { int32_t tind; int32_t i; for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) { if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK) break; } if (tind == MLX5_UIDX_TABLE_SIZE) return -1; if (!ctx->uidx_table[tind].refcnt) return tind << MLX5_UIDX_TABLE_SHIFT; for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) { if (!ctx->uidx_table[tind].table[i]) break; } return (tind << MLX5_UIDX_TABLE_SHIFT) | i; } int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc) { int32_t tind; int32_t ret = -1; int32_t uidx; pthread_mutex_lock(&ctx->uidx_table_mutex); uidx = get_free_uidx(ctx); if (uidx < 0) goto out; tind = uidx >> MLX5_UIDX_TABLE_SHIFT; if (!ctx->uidx_table[tind].refcnt) { ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1, sizeof(struct mlx5_resource *)); if (!ctx->uidx_table[tind].table) goto out; } ++ctx->uidx_table[tind].refcnt; ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc; ret = uidx; out: pthread_mutex_unlock(&ctx->uidx_table_mutex); return ret; } void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx) { int tind = uidx >> MLX5_UIDX_TABLE_SHIFT; pthread_mutex_lock(&ctx->uidx_table_mutex); if (!--ctx->uidx_table[tind].refcnt) free(ctx->uidx_table[tind].table); else ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL; pthread_mutex_unlock(&ctx->uidx_table_mutex); } static int mlx5_is_sandy_bridge(int *num_cores) { char line[128]; FILE *fd; int rc = 0; int cur_cpu_family = -1; int cur_cpu_model = -1; fd = fopen("/proc/cpuinfo", "r"); if (!fd) return 0; *num_cores = 0; while (fgets(line, 128, fd)) { int value; /* if this is information on new processor */ if (!strncmp(line, "processor", 9)) { ++*num_cores; cur_cpu_family = -1; cur_cpu_model = -1; } else if (!strncmp(line, "cpu family", 10)) { if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value))) cur_cpu_family = value; } else if (!strncmp(line, "model", 5)) { if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value))) cur_cpu_model = value; } /* if this is a Sandy Bridge CPU */ if ((cur_cpu_family == 6) && (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) )) rc = 1; } fclose(fd); return rc; } /* man cpuset This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between words. Words are displayed in big-endian order, which has the most significant bit first. 
The hex digits within a word are also in big-endian order. The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on the size of the bitmask. Examples of the Mask Format: 00000001 # just bit 0 set 40000000,00000000,00000000 # just bit 94 set 000000ff,00000000 # bits 32-39 set 00000000,000E3862 # 1,5,6,11-13,17-19 set A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as: 00000001,00000001,00010117 The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for bit 4, and the "7" is for bits 2, 1, and 0. */ static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set) { char *p, buf[1024]; char *env_value; uint32_t word; int i, k; env_value = getenv("MLX5_LOCAL_CPUS"); if (env_value) strncpy(buf, env_value, sizeof(buf)); else { char fname[MAXPATHLEN]; snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s", ibv_get_device_name(ibdev)); if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) { fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname); return; } } p = strrchr(buf, ','); if (!p) p = buf; i = 0; do { if (*p == ',') { *p = 0; p ++; } word = strtoul(p, NULL, 16); for (k = 0; word; ++k, word >>= 1) if (word & 1) CPU_SET(k+i, cpu_set); if (p == buf) break; p = strrchr(buf, ','); if (!p) p = buf; i += 32; } while (i < CPU_SETSIZE); } static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev) { cpuset_t my_cpus, dev_local_cpus, result_set; int stall_enable; int ret; int num_cores; if (!mlx5_is_sandy_bridge(&num_cores)) return 0; /* by default enable stall on sandy bridge arch */ stall_enable = 1; /* * check if app is bound to cpu set that is inside * of device local cpu set. Disable stalling if true */ /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */ CPU_ZERO(&my_cpus); CPU_ZERO(&dev_local_cpus); CPU_ZERO(&result_set); ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, sizeof(my_cpus), &my_cpus); if (ret == -1) { if (errno == EINVAL) fprintf(stderr, PFX "Warning: my cpu set is too small\n"); else fprintf(stderr, PFX "Warning: failed to get my cpu set\n"); goto out; } /* get device local cpu set */ mlx5_local_cpu_set(ibdev, &dev_local_cpus); /* check if my cpu set is in dev cpu */ #if __FreeBSD_version < 1400046 CPU_OR(&result_set, &my_cpus); CPU_OR(&result_set, &dev_local_cpus); #else CPU_OR(&result_set, &my_cpus, &dev_local_cpus); #endif stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1; out: return stall_enable; } static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx) { char *env_value; env_value = getenv("MLX5_STALL_CQ_POLL"); if (env_value) /* check if cq stall is enforced by user */ ctx->stall_enable = (strcmp(env_value, "0")) ? 
1 : 0; else /* autodetect if we need to do cq polling */ ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev); env_value = getenv("MLX5_STALL_NUM_LOOP"); if (env_value) mlx5_stall_num_loop = atoi(env_value); env_value = getenv("MLX5_STALL_CQ_POLL_MIN"); if (env_value) mlx5_stall_cq_poll_min = atoi(env_value); env_value = getenv("MLX5_STALL_CQ_POLL_MAX"); if (env_value) mlx5_stall_cq_poll_max = atoi(env_value); env_value = getenv("MLX5_STALL_CQ_INC_STEP"); if (env_value) mlx5_stall_cq_inc_step = atoi(env_value); env_value = getenv("MLX5_STALL_CQ_DEC_STEP"); if (env_value) mlx5_stall_cq_dec_step = atoi(env_value); ctx->stall_adaptive_enable = 0; ctx->stall_cycles = 0; if (mlx5_stall_num_loop < 0) { ctx->stall_adaptive_enable = 1; ctx->stall_cycles = mlx5_stall_cq_poll_min; } } static int get_total_uuars(int page_size) { int size = MLX5_DEF_TOT_UUARS; int uuars_in_page; char *env; env = getenv("MLX5_TOTAL_UUARS"); if (env) size = atoi(env); if (size < 1) return -EINVAL; uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR; size = max(uuars_in_page, size); size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR); if (size > MLX5_MAX_BFREGS) return -ENOMEM; return size; } static void open_debug_file(struct mlx5_context *ctx) { char *env; env = getenv("MLX5_DEBUG_FILE"); if (!env) { ctx->dbg_fp = stderr; return; } ctx->dbg_fp = fopen(env, "aw+"); if (!ctx->dbg_fp) { fprintf(stderr, "Failed opening debug file %s, using stderr\n", env); ctx->dbg_fp = stderr; return; } } static void close_debug_file(struct mlx5_context *ctx) { if (ctx->dbg_fp && ctx->dbg_fp != stderr) fclose(ctx->dbg_fp); } static void set_debug_mask(void) { char *env; env = getenv("MLX5_DEBUG_MASK"); if (env) mlx5_debug_mask = strtol(env, NULL, 0); } static void set_freeze_on_error(void) { char *env; env = getenv("MLX5_FREEZE_ON_ERROR_CQE"); if (env) mlx5_freeze_on_error_cqe = strtol(env, NULL, 0); } static int get_always_bf(void) { char *env; env = getenv("MLX5_POST_SEND_PREFER_BF"); if (!env) return 1; return strcmp(env, "0") ? 1 : 0; } static int get_shut_up_bf(void) { char *env; env = getenv("MLX5_SHUT_UP_BF"); if (!env) return 0; return strcmp(env, "0") ? 1 : 0; } static int get_num_low_lat_uuars(int tot_uuars) { char *env; int num = 4; env = getenv("MLX5_NUM_LOW_LAT_UUARS"); if (env) num = atoi(env); if (num < 0) return -EINVAL; num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD); return num; } /* The library allocates an array of uuar contexts. The one in index zero does * not to execersize odd/even policy so it can avoid a lock but it may not use * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock * since they are assigned to one QP only. The rest can use blue flame but since * they are shared they need a lock */ static int need_uuar_lock(struct mlx5_context *ctx, int uuarn) { if (uuarn == 0 || mlx5_single_threaded) return 0; if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2) return 0; return 1; } static int single_threaded_app(void) { char *env; env = getenv("MLX5_SINGLE_THREADED"); if (env) return strcmp(env, "1") ? 0 : 1; return 0; } static int mlx5_cmd_get_context(struct mlx5_context *context, struct mlx5_alloc_ucontext *req, size_t req_len, struct mlx5_alloc_ucontext_resp *resp, size_t resp_len) { if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req, req_len, &resp->ibv_resp, resp_len)) return 0; /* The ibv_cmd_get_context fails in older kernels when passing * a request length that the kernel doesn't know. 
* To avoid breaking compatibility of new libmlx5 and older * kernels, when ibv_cmd_get_context fails with the full * request length, we try once again with the legacy length. * We repeat this process while reducing requested size based * on the feature input size. To avoid this in the future, we * will remove the check in kernel that requires fields unknown * to the kernel to be cleared. This will require that any new * feature that involves extending struct mlx5_alloc_ucontext * will be accompanied by an indication in the form of one or * more fields in struct mlx5_alloc_ucontext_resp. If the * response value can be interpreted as feature not supported * when the returned value is zero, this will suffice to * indicate to the library that the request was ignored by the * kernel, either because it is unaware or because it decided * to do so. If zero is a valid response, we will add a new * field that indicates whether the request was handled. */ if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req, offsetof(struct mlx5_alloc_ucontext, lib_caps), &resp->ibv_resp, resp_len)) return 0; return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req, offsetof(struct mlx5_alloc_ucontext, cqe_version), &resp->ibv_resp, resp_len); } static int mlx5_map_internal_clock(struct mlx5_device *mdev, struct ibv_context *ibv_ctx) { struct mlx5_context *context = to_mctx(ibv_ctx); void *hca_clock_page; off_t offset = 0; set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset); hca_clock_page = mmap(NULL, mdev->page_size, PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd, mdev->page_size * offset); if (hca_clock_page == MAP_FAILED) { fprintf(stderr, PFX "Warning: Timestamp available,\n" "but failed to mmap() hca core clock page.\n"); return -1; } context->hca_core_clock = hca_clock_page + (context->core_clock.offset & (mdev->page_size - 1)); return 0; } int mlx5dv_query_device(struct ibv_context *ctx_in, struct mlx5dv_context *attrs_out) { struct mlx5_context *mctx = to_mctx(ctx_in); uint64_t comp_mask_out = 0; attrs_out->version = 0; attrs_out->flags = 0; if (mctx->cqe_version == MLX5_CQE_VERSION_V1) attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1; if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW) attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW; if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) { attrs_out->cqe_comp_caps = mctx->cqe_comp_caps; comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION; } attrs_out->comp_mask = comp_mask_out; return 0; } static int mlx5dv_get_qp(struct ibv_qp *qp_in, struct mlx5dv_qp *qp_out) { struct mlx5_qp *mqp = to_mqp(qp_in); qp_out->comp_mask = 0; qp_out->dbrec = mqp->db; if (mqp->sq_buf_size) /* IBV_QPT_RAW_PACKET */ qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf); else qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset); qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt; qp_out->sq.stride = 1 << mqp->sq.wqe_shift; qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset); qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt; qp_out->rq.stride = 1 << mqp->rq.wqe_shift; qp_out->bf.reg = mqp->bf->reg; if (mqp->bf->uuarn > 0) qp_out->bf.size = mqp->bf->buf_size; else qp_out->bf.size = 0; return 0; } static int mlx5dv_get_cq(struct ibv_cq *cq_in, struct mlx5dv_cq *cq_out) { struct mlx5_cq *mcq = to_mcq(cq_in); struct mlx5_context *mctx = to_mctx(cq_in->context); cq_out->comp_mask = 0; cq_out->cqn = mcq->cqn; cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1; cq_out->cqe_size = mcq->cqe_sz; cq_out->buf = mcq->active_buf->buf; cq_out->dbrec = mcq->dbrec; cq_out->uar = mctx->uar; mcq->flags 
|= MLX5_CQ_FLAGS_DV_OWNED; return 0; } static int mlx5dv_get_rwq(struct ibv_wq *wq_in, struct mlx5dv_rwq *rwq_out) { struct mlx5_rwq *mrwq = to_mrwq(wq_in); rwq_out->comp_mask = 0; rwq_out->buf = mrwq->pbuff; rwq_out->dbrec = mrwq->recv_db; rwq_out->wqe_cnt = mrwq->rq.wqe_cnt; rwq_out->stride = 1 << mrwq->rq.wqe_shift; return 0; } static int mlx5dv_get_srq(struct ibv_srq *srq_in, struct mlx5dv_srq *srq_out) { struct mlx5_srq *msrq; msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq); srq_out->comp_mask = 0; srq_out->buf = msrq->buf.buf; srq_out->dbrec = msrq->db; srq_out->stride = 1 << msrq->wqe_shift; srq_out->head = msrq->head; srq_out->tail = msrq->tail; return 0; } int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type) { int ret = 0; if (obj_type & MLX5DV_OBJ_QP) ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out); if (!ret && (obj_type & MLX5DV_OBJ_CQ)) ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out); if (!ret && (obj_type & MLX5DV_OBJ_SRQ)) ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out); if (!ret && (obj_type & MLX5DV_OBJ_RWQ)) ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out); return ret; } static void adjust_uar_info(struct mlx5_device *mdev, struct mlx5_context *context, struct mlx5_alloc_ucontext_resp resp) { if (!resp.log_uar_size && !resp.num_uars_per_page) { /* old kernel */ context->uar_size = mdev->page_size; context->num_uars_per_page = 1; return; } context->uar_size = 1 << resp.log_uar_size; context->num_uars_per_page = resp.num_uars_per_page; } static int mlx5_init_context(struct verbs_device *vdev, struct ibv_context *ctx, int cmd_fd) { struct mlx5_context *context; struct mlx5_alloc_ucontext req; struct mlx5_alloc_ucontext_resp resp; int i; int page_size; int tot_uuars; int low_lat_uuars; int gross_uuars; int j; off_t offset; struct mlx5_device *mdev; struct verbs_context *v_ctx; struct ibv_port_attr port_attr; struct ibv_device_attr_ex device_attr; int k; int bfi; int num_sys_page_map; mdev = to_mdev(&vdev->device); v_ctx = verbs_get_ctx(ctx); page_size = mdev->page_size; mlx5_single_threaded = single_threaded_app(); context = to_mctx(ctx); context->ibv_ctx.cmd_fd = cmd_fd; open_debug_file(context); set_debug_mask(); set_freeze_on_error(); if (gethostname(context->hostname, sizeof(context->hostname))) strcpy(context->hostname, "host_unknown"); tot_uuars = get_total_uuars(page_size); if (tot_uuars < 0) { errno = -tot_uuars; goto err_free; } low_lat_uuars = get_num_low_lat_uuars(tot_uuars); if (low_lat_uuars < 0) { errno = -low_lat_uuars; goto err_free; } if (low_lat_uuars > tot_uuars - 1) { errno = ENOMEM; goto err_free; } memset(&req, 0, sizeof(req)); memset(&resp, 0, sizeof(resp)); req.total_num_uuars = tot_uuars; req.num_low_latency_uuars = low_lat_uuars; req.cqe_version = MLX5_CQE_VERSION_V1; req.lib_caps |= MLX5_LIB_CAP_4K_UAR; if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp, sizeof(resp))) goto err_free; context->max_num_qps = resp.qp_tab_size; context->bf_reg_size = resp.bf_reg_size; context->tot_uuars = resp.tot_uuars; context->low_lat_uuars = low_lat_uuars; context->cache_line_size = resp.cache_line_size; context->max_sq_desc_sz = resp.max_sq_desc_sz; context->max_rq_desc_sz = resp.max_rq_desc_sz; context->max_send_wqebb = resp.max_send_wqebb; context->num_ports = resp.num_ports; context->max_recv_wr = resp.max_recv_wr; context->max_srq_recv_wr = resp.max_srq_recv_wr; context->cqe_version = resp.cqe_version; if (context->cqe_version) { if (context->cqe_version == MLX5_CQE_VERSION_V1) mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1; else goto err_free; } 
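/*
 * Layout note for the UAR/blue-flame setup that follows: the kernel
 * returns tot_uuars doorbell slots which live in 4KB adapter (UAR)
 * pages of MLX5_ADAPTER_PAGE_SIZE bytes; num_uars_per_page such UAR
 * pages fit into every system page mmap()ed below, and each UAR page
 * exposes NUM_BFREGS_PER_UAR blue-flame registers of bf_reg_size
 * bytes starting at MLX5_BF_OFFSET.  The mapping loop therefore
 * addresses register k of UAR j inside system page i as
 *
 *	bfi = (i * num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
 *	bfs[bfi].reg = uar[i] + MLX5_ADAPTER_PAGE_SIZE * j
 *			+ MLX5_BF_OFFSET + k * bf_reg_size;
 *
 * so consecutive bfi values walk the registers of one UAR page before
 * moving on to the next.
 */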
adjust_uar_info(mdev, context, resp); gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR; context->bfs = calloc(gross_uuars, sizeof(*context->bfs)); if (!context->bfs) { errno = ENOMEM; goto err_free; } context->cmds_supp_uhw = resp.cmds_supp_uhw; context->vendor_cap_flags = 0; - pthread_mutex_init(&context->qp_table_mutex, NULL); - pthread_mutex_init(&context->srq_table_mutex, NULL); - pthread_mutex_init(&context->uidx_table_mutex, NULL); + if (pthread_mutex_init(&context->qp_table_mutex, NULL)) + goto err_free_bf; + if (pthread_mutex_init(&context->srq_table_mutex, NULL)) + goto err_qp_table_mutex; + if (pthread_mutex_init(&context->uidx_table_mutex, NULL)) + goto err_srq_table_mutex; for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i) context->qp_table[i].refcnt = 0; for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i) context->uidx_table[i].refcnt = 0; context->db_list = NULL; - pthread_mutex_init(&context->db_list_mutex, NULL); + if (pthread_mutex_init(&context->db_list_mutex, NULL)) + goto err_uidx_table_mutex; num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR); for (i = 0; i < num_sys_page_map; ++i) { offset = 0; set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset); set_index(i, &offset); context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, cmd_fd, page_size * offset); if (context->uar[i] == MAP_FAILED) { context->uar[i] = NULL; - goto err_free_bf; + goto err_db_list_mutex; } } for (i = 0; i < num_sys_page_map; i++) { for (j = 0; j < context->num_uars_per_page; j++) { for (k = 0; k < NUM_BFREGS_PER_UAR; k++) { bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k; context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j + MLX5_BF_OFFSET + k * context->bf_reg_size; context->bfs[bfi].need_lock = need_uuar_lock(context, bfi); - mlx5_spinlock_init(&context->bfs[bfi].lock); + if (mlx5_spinlock_init(&context->bfs[bfi].lock)) + goto err_bfs_spl; context->bfs[bfi].offset = 0; if (bfi) context->bfs[bfi].buf_size = context->bf_reg_size / 2; context->bfs[bfi].uuarn = bfi; } } } context->hca_core_clock = NULL; if (resp.response_length + sizeof(resp.ibv_resp) >= offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) + sizeof(resp.hca_core_clock_offset) && resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) { context->core_clock.offset = resp.hca_core_clock_offset; mlx5_map_internal_clock(mdev, ctx); } - mlx5_spinlock_init(&context->lock32); + if (mlx5_spinlock_init(&context->lock32)) + goto err_bfs_spl; context->prefer_bf = get_always_bf(); context->shut_up_bf = get_shut_up_bf(); mlx5_read_env(&vdev->device, context); - mlx5_spinlock_init(&context->hugetlb_lock); + if (mlx5_spinlock_init(&context->hugetlb_lock)) + goto err_32_spl; TAILQ_INIT(&context->hugetlb_list); context->ibv_ctx.ops = mlx5_ctx_ops; verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex); verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd); verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd); verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex); verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num); verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex); verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values); verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow); verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow); verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex); verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq); verbs_set_ctx_op(v_ctx, 
modify_wq, mlx5_modify_wq); verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq); verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table); verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table); memset(&device_attr, 0, sizeof(device_attr)); if (!mlx5_query_device_ex(ctx, NULL, &device_attr, sizeof(struct ibv_device_attr_ex))) { context->cached_device_cap_flags = device_attr.orig_attr.device_cap_flags; context->atomic_cap = device_attr.orig_attr.atomic_cap; context->cached_tso_caps = device_attr.tso_caps; } for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) { memset(&port_attr, 0, sizeof(port_attr)); if (!mlx5_query_port(ctx, j + 1, &port_attr)) context->cached_link_layer[j] = port_attr.link_layer; } return 0; +err_32_spl: + mlx5_spinlock_destroy(&context->lock32); + +err_bfs_spl: + for (i = 0; i < num_sys_page_map; i++) { + for (j = 0; j < context->num_uars_per_page; j++) { + for (k = 0; k < NUM_BFREGS_PER_UAR; k++) { + bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k; + mlx5_spinlock_destroy(&context->bfs[bfi].lock); + } + } + } + +err_db_list_mutex: + pthread_mutex_destroy(&context->db_list_mutex); + +err_uidx_table_mutex: + pthread_mutex_destroy(&context->uidx_table_mutex); + +err_srq_table_mutex: + pthread_mutex_destroy(&context->srq_table_mutex); + +err_qp_table_mutex: + pthread_mutex_destroy(&context->qp_table_mutex); + err_free_bf: free(context->bfs); err_free: for (i = 0; i < MLX5_MAX_UARS; ++i) { if (context->uar[i]) munmap(context->uar[i], page_size); } close_debug_file(context); return errno; } static void mlx5_cleanup_context(struct verbs_device *device, struct ibv_context *ibctx) { struct mlx5_context *context = to_mctx(ibctx); int page_size = to_mdev(ibctx->device)->page_size; int i; + int j; + int k; + int bfi; + int num_sys_page_map; + + num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR); + for (i = 0; i < num_sys_page_map; i++) { + for (j = 0; j < context->num_uars_per_page; j++) { + for (k = 0; k < NUM_BFREGS_PER_UAR; k++) { + bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k; + mlx5_spinlock_destroy(&context->bfs[bfi].lock); + } + } + } + mlx5_spinlock_destroy(&context->hugetlb_lock); + mlx5_spinlock_destroy(&context->lock32); + pthread_mutex_destroy(&context->db_list_mutex); + pthread_mutex_destroy(&context->uidx_table_mutex); + pthread_mutex_destroy(&context->srq_table_mutex); + pthread_mutex_destroy(&context->qp_table_mutex); free(context->bfs); for (i = 0; i < MLX5_MAX_UARS; ++i) { if (context->uar[i]) munmap(context->uar[i], page_size); } if (context->hca_core_clock) munmap(context->hca_core_clock - context->core_clock.offset, page_size); close_debug_file(context); } static struct verbs_device_ops mlx5_dev_ops = { .init_context = mlx5_init_context, .uninit_context = mlx5_cleanup_context, }; static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path, int abi_version) { char value[8]; struct mlx5_device *dev; unsigned vendor, device; int i; if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", value, sizeof value) < 0) return NULL; sscanf(value, "%i", &vendor); if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", value, sizeof value) < 0) return NULL; sscanf(value, "%i", &device); for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) if (vendor == hca_table[i].vendor && device == hca_table[i].device) goto found; return NULL; found: if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION || abi_version > 
MLX5_UVERBS_MAX_ABI_VERSION) { fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported " "(min supported %d, max supported %d)\n", abi_version, uverbs_sys_path, MLX5_UVERBS_MIN_ABI_VERSION, MLX5_UVERBS_MAX_ABI_VERSION); return NULL; } dev = calloc(1, sizeof *dev); if (!dev) { fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n", uverbs_sys_path); return NULL; } dev->page_size = sysconf(_SC_PAGESIZE); dev->driver_abi_ver = abi_version; dev->verbs_dev.ops = &mlx5_dev_ops; dev->verbs_dev.sz = sizeof(*dev); dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) - sizeof(struct ibv_context); return &dev->verbs_dev; } static __attribute__((constructor)) void mlx5_register_driver(void) { verbs_register_driver("mlx5", mlx5_driver_init); } diff --git a/contrib/ofed/libmlx5/verbs.c b/contrib/ofed/libmlx5/verbs.c index 2e2f74f0882b..766751041050 100644 --- a/contrib/ofed/libmlx5/verbs.c +++ b/contrib/ofed/libmlx5/verbs.c @@ -1,2212 +1,2251 @@ /* * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx5.h" #include "mlx5-abi.h" #include "wqe.h" int mlx5_single_threaded = 0; static inline int is_xrc_tgt(int type) { return type == IBV_QPT_XRC_RECV; } int mlx5_query_device(struct ibv_context *context, struct ibv_device_attr *attr) { struct ibv_query_device cmd; uint64_t raw_fw_ver; unsigned major, minor, sub_minor; int ret; ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); if (ret) return ret; major = (raw_fw_ver >> 32) & 0xffff; minor = (raw_fw_ver >> 16) & 0xffff; sub_minor = raw_fw_ver & 0xffff; snprintf(attr->fw_ver, sizeof attr->fw_ver, "%d.%d.%04d", major, minor, sub_minor); return 0; } #define READL(ptr) (*((uint32_t *)(ptr))) static int mlx5_read_clock(struct ibv_context *context, uint64_t *cycles) { unsigned int clockhi, clocklo, clockhi1; int i; struct mlx5_context *ctx = to_mctx(context); if (!ctx->hca_core_clock) return -EOPNOTSUPP; /* Handle wraparound */ for (i = 0; i < 2; i++) { clockhi = be32toh(READL(ctx->hca_core_clock)); clocklo = be32toh(READL(ctx->hca_core_clock + 4)); clockhi1 = be32toh(READL(ctx->hca_core_clock)); if (clockhi == clockhi1) break; } *cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo; return 0; } int mlx5_query_rt_values(struct ibv_context *context, struct ibv_values_ex *values) { uint32_t comp_mask = 0; int err = 0; if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) { uint64_t cycles; err = mlx5_read_clock(context, &cycles); if (!err) { values->raw_clock.tv_sec = 0; values->raw_clock.tv_nsec = cycles; comp_mask |= IBV_VALUES_MASK_RAW_CLOCK; } } values->comp_mask = comp_mask; return err; } int mlx5_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr) { struct ibv_query_port cmd; return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); } struct ibv_pd *mlx5_alloc_pd(struct ibv_context *context) { struct ibv_alloc_pd cmd; struct mlx5_alloc_pd_resp resp; struct mlx5_pd *pd; pd = calloc(1, sizeof *pd); if (!pd) return NULL; if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, &resp.ibv_resp, sizeof resp)) { free(pd); return NULL; } pd->pdn = resp.pdn; return &pd->ibv_pd; } int mlx5_free_pd(struct ibv_pd *pd) { int ret; ret = ibv_cmd_dealloc_pd(pd); if (ret) return ret; free(to_mpd(pd)); return 0; } struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int acc) { struct mlx5_mr *mr; struct ibv_reg_mr cmd; int ret; enum ibv_access_flags access = (enum ibv_access_flags)acc; struct ibv_reg_mr_resp resp; mr = calloc(1, sizeof(*mr)); if (!mr) return NULL; ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t)addr, access, &(mr->ibv_mr), &cmd, sizeof(cmd), &resp, sizeof resp); if (ret) { mlx5_free_buf(&(mr->buf)); free(mr); return NULL; } mr->alloc_flags = acc; return &mr->ibv_mr; } int mlx5_rereg_mr(struct ibv_mr *ibmr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access) { struct ibv_rereg_mr cmd; struct ibv_rereg_mr_resp resp; if (flags & IBV_REREG_MR_KEEP_VALID) return ENOTSUP; return ibv_cmd_rereg_mr(ibmr, flags, addr, length, (uintptr_t)addr, access, pd, &cmd, sizeof(cmd), &resp, sizeof(resp)); } int mlx5_dereg_mr(struct ibv_mr *ibmr) { int ret; struct mlx5_mr *mr = to_mmr(ibmr); ret = ibv_cmd_dereg_mr(ibmr); if (ret) return ret; free(mr); return 0; } struct ibv_mw *mlx5_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) { struct ibv_mw *mw; struct ibv_alloc_mw cmd; struct ibv_alloc_mw_resp resp; int ret; mw = malloc(sizeof(*mw)); if 
(!mw) return NULL; memset(mw, 0, sizeof(*mw)); ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp, sizeof(resp)); if (ret) { free(mw); return NULL; } return mw; } int mlx5_dealloc_mw(struct ibv_mw *mw) { int ret; struct ibv_dealloc_mw cmd; ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd)); if (ret) return ret; free(mw); return 0; } int mlx5_round_up_power_of_two(long long sz) { long long ret; for (ret = 1; ret < sz; ret <<= 1) ; /* nothing */ if (ret > INT_MAX) { fprintf(stderr, "%s: roundup overflow\n", __func__); return -ENOMEM; } return (int)ret; } static int align_queue_size(long long req) { return mlx5_round_up_power_of_two(req); } static int get_cqe_size(void) { char *env; int size = 64; env = getenv("MLX5_CQE_SIZE"); if (env) size = atoi(env); switch (size) { case 64: case 128: return size; default: return -EINVAL; } } static int use_scatter_to_cqe(void) { char *env; env = getenv("MLX5_SCATTER_TO_CQE"); if (env && !strcmp(env, "0")) return 0; return 1; } static int srq_sig_enabled(void) { char *env; env = getenv("MLX5_SRQ_SIGNATURE"); if (env) return 1; return 0; } static int qp_sig_enabled(void) { char *env; env = getenv("MLX5_QP_SIGNATURE"); if (env) return 1; return 0; } enum { CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS | IBV_WC_EX_WITH_COMPLETION_TIMESTAMP | IBV_WC_EX_WITH_CVLAN | IBV_WC_EX_WITH_FLOW_TAG }; enum { CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS }; enum { CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED }; static struct ibv_cq_ex *create_cq(struct ibv_context *context, const struct ibv_cq_init_attr_ex *cq_attr, int cq_alloc_flags, struct mlx5dv_cq_init_attr *mlx5cq_attr) { struct mlx5_create_cq cmd; struct mlx5_create_cq_resp resp; struct mlx5_cq *cq; int cqe_sz; int ret; int ncqe; struct mlx5_context *mctx = to_mctx(context); FILE *fp = to_mctx(context)->dbg_fp; if (!cq_attr->cqe) { mlx5_dbg(fp, MLX5_DBG_CQ, "CQE invalid\n"); errno = EINVAL; return NULL; } if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) { mlx5_dbg(fp, MLX5_DBG_CQ, "Unsupported comp_mask for create_cq\n"); errno = EINVAL; return NULL; } if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) { mlx5_dbg(fp, MLX5_DBG_CQ, "Unsupported creation flags requested for create_cq\n"); errno = EINVAL; return NULL; } if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS) { mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); errno = ENOTSUP; return NULL; } cq = calloc(1, sizeof *cq); if (!cq) { mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); return NULL; } memset(&cmd, 0, sizeof cmd); cq->cons_index = 0; if (mlx5_spinlock_init(&cq->lock)) goto err; ncqe = align_queue_size(cq_attr->cqe + 1); if ((ncqe > (1 << 24)) || (ncqe < (cq_attr->cqe + 1))) { mlx5_dbg(fp, MLX5_DBG_CQ, "ncqe %d\n", ncqe); errno = EINVAL; goto err_spl; } cqe_sz = get_cqe_size(); if (cqe_sz < 0) { mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); errno = -cqe_sz; goto err_spl; } if (mlx5_alloc_cq_buf(to_mctx(context), cq, &cq->buf_a, ncqe, cqe_sz)) { mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); goto err_spl; } cq->dbrec = mlx5_alloc_dbrec(to_mctx(context)); if (!cq->dbrec) { mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); goto err_buf; } cq->dbrec[MLX5_CQ_SET_CI] = 0; cq->dbrec[MLX5_CQ_ARM_DB] = 0; cq->arm_sn = 0; cq->cqe_sz = cqe_sz; cq->flags = cq_alloc_flags; if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED) cq->flags |= MLX5_CQ_FLAGS_SINGLE_THREADED; cmd.buf_addr = (uintptr_t) cq->buf_a.buf; cmd.db_addr = (uintptr_t) cq->dbrec; cmd.cqe_size = cqe_sz; if 
(mlx5cq_attr) { if (mlx5cq_attr->comp_mask & ~(MLX5DV_CQ_INIT_ATTR_MASK_RESERVED - 1)) { mlx5_dbg(fp, MLX5_DBG_CQ, "Unsupported vendor comp_mask for create_cq\n"); errno = EINVAL; goto err_db; } if (mlx5cq_attr->comp_mask & MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE) { if (mctx->cqe_comp_caps.max_num && (mlx5cq_attr->cqe_comp_res_format & mctx->cqe_comp_caps.supported_format)) { cmd.cqe_comp_en = 1; cmd.cqe_comp_res_format = mlx5cq_attr->cqe_comp_res_format; } else { mlx5_dbg(fp, MLX5_DBG_CQ, "CQE Compression is not supported\n"); errno = EINVAL; goto err_db; } } } ret = ibv_cmd_create_cq(context, ncqe - 1, cq_attr->channel, cq_attr->comp_vector, ibv_cq_ex_to_cq(&cq->ibv_cq), &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) { mlx5_dbg(fp, MLX5_DBG_CQ, "ret %d\n", ret); goto err_db; } cq->active_buf = &cq->buf_a; cq->resize_buf = NULL; cq->cqn = resp.cqn; cq->stall_enable = to_mctx(context)->stall_enable; cq->stall_adaptive_enable = to_mctx(context)->stall_adaptive_enable; cq->stall_cycles = to_mctx(context)->stall_cycles; if (cq_alloc_flags & MLX5_CQ_FLAGS_EXTENDED) mlx5_cq_fill_pfns(cq, cq_attr); return &cq->ibv_cq; err_db: mlx5_free_db(to_mctx(context), cq->dbrec); err_buf: mlx5_free_cq_buf(to_mctx(context), &cq->buf_a); err_spl: mlx5_spinlock_destroy(&cq->lock); err: free(cq); return NULL; } struct ibv_cq *mlx5_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector) { struct ibv_cq_ex *cq; struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel, .comp_vector = comp_vector, .wc_flags = IBV_WC_STANDARD_FLAGS}; if (cqe <= 0) { errno = EINVAL; return NULL; } cq = create_cq(context, &cq_attr, 0, NULL); return cq ? ibv_cq_ex_to_cq(cq) : NULL; } struct ibv_cq_ex *mlx5_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr) { return create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, NULL); } struct ibv_cq_ex *mlx5dv_create_cq(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, struct mlx5dv_cq_init_attr *mlx5_cq_attr) { struct ibv_cq_ex *cq; + int err = 0; cq = create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, mlx5_cq_attr); if (!cq) return NULL; - verbs_init_cq(ibv_cq_ex_to_cq(cq), context, + err = verbs_init_cq(ibv_cq_ex_to_cq(cq), context, cq_attr->channel, cq_attr->cq_context); + if (err) + goto err; + return cq; + +err: + context->ops.destroy_cq(ibv_cq_ex_to_cq(cq)); + + return NULL; } int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe) { struct mlx5_cq *cq = to_mcq(ibcq); struct mlx5_resize_cq_resp resp; struct mlx5_resize_cq cmd; struct mlx5_context *mctx = to_mctx(ibcq->context); int err; if (cqe < 0) { errno = EINVAL; return errno; } memset(&cmd, 0, sizeof(cmd)); memset(&resp, 0, sizeof(resp)); if (((long long)cqe * 64) > INT_MAX) return EINVAL; mlx5_spin_lock(&cq->lock); cq->active_cqes = cq->ibv_cq.cqe; if (cq->active_buf == &cq->buf_a) cq->resize_buf = &cq->buf_b; else cq->resize_buf = &cq->buf_a; cqe = align_queue_size(cqe + 1); if (cqe == ibcq->cqe + 1) { cq->resize_buf = NULL; err = 0; goto out; } /* currently we don't change cqe size */ cq->resize_cqe_sz = cq->cqe_sz; cq->resize_cqes = cqe; err = mlx5_alloc_cq_buf(mctx, cq, cq->resize_buf, cq->resize_cqes, cq->resize_cqe_sz); if (err) { cq->resize_buf = NULL; errno = ENOMEM; goto out; } cmd.buf_addr = (uintptr_t)cq->resize_buf->buf; cmd.cqe_size = cq->resize_cqe_sz; err = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (err) goto out_buf; mlx5_cq_resize_copy_cqes(cq); 
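/*
 * The resize path above works on a double buffer: buf_a and buf_b
 * alternate as the active CQE ring.  The inactive buffer is allocated
 * at the new size, ibv_cmd_resize_cq() asks the kernel to switch the
 * CQ over to it, and mlx5_cq_resize_copy_cqes() migrates the CQEs
 * that have not been polled yet.  The old ring is then freed and
 * resize_buf becomes active_buf below, all under cq->lock so pollers
 * never observe a half-switched ring.
 */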
mlx5_free_cq_buf(mctx, cq->active_buf); cq->active_buf = cq->resize_buf; cq->ibv_cq.cqe = cqe - 1; mlx5_spin_unlock(&cq->lock); cq->resize_buf = NULL; return 0; out_buf: mlx5_free_cq_buf(mctx, cq->resize_buf); cq->resize_buf = NULL; out: mlx5_spin_unlock(&cq->lock); return err; } int mlx5_destroy_cq(struct ibv_cq *cq) { int ret; + struct mlx5_cq *mcq = to_mcq(cq); ret = ibv_cmd_destroy_cq(cq); if (ret) return ret; + verbs_cleanup_cq(cq); mlx5_free_db(to_mctx(cq->context), to_mcq(cq)->dbrec); mlx5_free_cq_buf(to_mctx(cq->context), to_mcq(cq)->active_buf); + mlx5_spinlock_destroy(&mcq->lock); free(to_mcq(cq)); return 0; } struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *attr) { struct mlx5_create_srq cmd; struct mlx5_create_srq_resp resp; struct mlx5_srq *srq; int ret; struct mlx5_context *ctx; int max_sge; struct ibv_srq *ibsrq; ctx = to_mctx(pd->context); srq = calloc(1, sizeof *srq); if (!srq) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); return NULL; } ibsrq = &srq->vsrq.srq; memset(&cmd, 0, sizeof cmd); if (mlx5_spinlock_init(&srq->lock)) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); goto err; } if (attr->attr.max_wr > ctx->max_srq_recv_wr) { fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__, attr->attr.max_wr, ctx->max_srq_recv_wr); errno = EINVAL; - goto err; + goto err_spl; } /* * this calculation does not consider required control segments. The * final calculation is done again later. This is done so to avoid * overflows of variables */ max_sge = ctx->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); if (attr->attr.max_sge > max_sge) { fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__, attr->attr.max_wr, ctx->max_srq_recv_wr); errno = EINVAL; - goto err; + goto err_spl; } srq->max = align_queue_size(attr->attr.max_wr + 1); srq->max_gs = attr->attr.max_sge; srq->counter = 0; if (mlx5_alloc_srq_buf(pd->context, srq)) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); - goto err; + goto err_spl; } srq->db = mlx5_alloc_dbrec(to_mctx(pd->context)); if (!srq->db) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); goto err_free; } *srq->db = 0; cmd.buf_addr = (uintptr_t) srq->buf.buf; cmd.db_addr = (uintptr_t) srq->db; srq->wq_sig = srq_sig_enabled(); if (srq->wq_sig) cmd.flags = MLX5_SRQ_FLAG_SIGNATURE; attr->attr.max_sge = srq->max_gs; pthread_mutex_lock(&ctx->srq_table_mutex); ret = ibv_cmd_create_srq(pd, ibsrq, attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) goto err_db; ret = mlx5_store_srq(ctx, resp.srqn, srq); if (ret) goto err_destroy; pthread_mutex_unlock(&ctx->srq_table_mutex); srq->srqn = resp.srqn; srq->rsc.rsn = resp.srqn; srq->rsc.type = MLX5_RSC_TYPE_SRQ; return ibsrq; err_destroy: ibv_cmd_destroy_srq(ibsrq); err_db: pthread_mutex_unlock(&ctx->srq_table_mutex); mlx5_free_db(to_mctx(pd->context), srq->db); err_free: free(srq->wrid); mlx5_free_buf(&srq->buf); +err_spl: + mlx5_spinlock_destroy(&srq->lock); + err: free(srq); return NULL; } int mlx5_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int attr_mask) { struct ibv_modify_srq cmd; return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); } int mlx5_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) { struct ibv_query_srq cmd; return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); } int mlx5_destroy_srq(struct ibv_srq *srq) { int ret; struct mlx5_srq *msrq = to_msrq(srq); struct mlx5_context *ctx = to_mctx(srq->context); ret = ibv_cmd_destroy_srq(srq); if (ret) return ret; if 
(ctx->cqe_version && msrq->rsc.type == MLX5_RSC_TYPE_XSRQ) mlx5_clear_uidx(ctx, msrq->rsc.rsn); else mlx5_clear_srq(ctx, msrq->srqn); mlx5_free_db(ctx, msrq->db); mlx5_free_buf(&msrq->buf); free(msrq->wrid); + mlx5_spinlock_destroy(&msrq->lock); free(msrq); return 0; } static int sq_overhead(enum ibv_qp_type qp_type) { size_t size = 0; size_t mw_bind_size = sizeof(struct mlx5_wqe_umr_ctrl_seg) + sizeof(struct mlx5_wqe_mkey_context_seg) + max_t(size_t, sizeof(struct mlx5_wqe_umr_klm_seg), 64); switch (qp_type) { case IBV_QPT_RC: size += sizeof(struct mlx5_wqe_ctrl_seg) + max(sizeof(struct mlx5_wqe_atomic_seg) + sizeof(struct mlx5_wqe_raddr_seg), mw_bind_size); break; case IBV_QPT_UC: size = sizeof(struct mlx5_wqe_ctrl_seg) + max(sizeof(struct mlx5_wqe_raddr_seg), mw_bind_size); break; case IBV_QPT_UD: size = sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; case IBV_QPT_XRC_SEND: size = sizeof(struct mlx5_wqe_ctrl_seg) + mw_bind_size; SWITCH_FALLTHROUGH; case IBV_QPT_XRC_RECV: size = max(size, sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_xrc_seg) + sizeof(struct mlx5_wqe_raddr_seg)); break; case IBV_QPT_RAW_PACKET: size = sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_eth_seg); break; default: return -EINVAL; } return size; } static int mlx5_calc_send_wqe(struct mlx5_context *ctx, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp) { int size; int inl_size = 0; int max_gather; int tot_size; size = sq_overhead(attr->qp_type); if (size < 0) return size; if (attr->cap.max_inline_data) { inl_size = size + align(sizeof(struct mlx5_wqe_inl_data_seg) + attr->cap.max_inline_data, 16); } if (attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) { size += align(attr->max_tso_header, 16); qp->max_tso_header = attr->max_tso_header; } max_gather = (ctx->max_sq_desc_sz - size) / sizeof(struct mlx5_wqe_data_seg); if (attr->cap.max_send_sge > max_gather) return -EINVAL; size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); tot_size = max_int(size, inl_size); if (tot_size > ctx->max_sq_desc_sz) return -EINVAL; return align(tot_size, MLX5_SEND_WQE_BB); } static int mlx5_calc_rcv_wqe(struct mlx5_context *ctx, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp) { uint32_t size; int num_scatter; if (attr->srq) return 0; num_scatter = max_t(uint32_t, attr->cap.max_recv_sge, 1); size = sizeof(struct mlx5_wqe_data_seg) * num_scatter; if (qp->wq_sig) size += sizeof(struct mlx5_rwqe_sig); if (size > ctx->max_rq_desc_sz) return -EINVAL; size = mlx5_round_up_power_of_two(size); return size; } static int mlx5_calc_sq_size(struct mlx5_context *ctx, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp) { int wqe_size; int wq_size; FILE *fp = ctx->dbg_fp; if (!attr->cap.max_send_wr) return 0; wqe_size = mlx5_calc_send_wqe(ctx, attr, qp); if (wqe_size < 0) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); return wqe_size; } if (wqe_size > ctx->max_sq_desc_sz) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); return -EINVAL; } qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - sizeof(struct mlx5_wqe_inl_data_seg); attr->cap.max_inline_data = qp->max_inline_data; /* * to avoid overflow, we limit max_send_wr so * that the multiplication will fit in int */ if (attr->cap.max_send_wr > 0x7fffffff / ctx->max_sq_desc_sz) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); return -EINVAL; } wq_size = mlx5_round_up_power_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; if (qp->sq.wqe_cnt > ctx->max_send_wqebb) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); return -EINVAL; } 
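/*
 * A worked example of the send-queue sizing above, with illustrative
 * figures only (assuming a UD QP whose sq_overhead() is a 16-byte
 * ctrl segment plus a 48-byte datagram segment): with max_send_sge = 4
 * the WQE needs 64 + 4 * 16 = 128 bytes, already a multiple of the
 * 64-byte MLX5_SEND_WQE_BB.  For max_send_wr = 100:
 *
 *	wq_size     = roundup_pow_of_two(100 * 128) = 16384
 *	sq.wqe_cnt  = 16384 / MLX5_SEND_WQE_BB = 256 basic blocks
 *	sq.max_post = 16384 / 128 = 128 full-sized WQEs
 *
 * i.e. the ring is accounted in 64-byte basic blocks (wqe_shift = 6)
 * while max_post reflects how many maximum-sized WQEs fit.
 */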
qp->sq.wqe_shift = mlx5_ilog2(MLX5_SEND_WQE_BB); qp->sq.max_gs = attr->cap.max_send_sge; qp->sq.max_post = wq_size / wqe_size; return wq_size; } static int mlx5_calc_rwq_size(struct mlx5_context *ctx, struct mlx5_rwq *rwq, struct ibv_wq_init_attr *attr) { size_t wqe_size; int wq_size; uint32_t num_scatter; int scat_spc; if (!attr->max_wr) return -EINVAL; /* TBD: check caps for RQ */ num_scatter = max_t(uint32_t, attr->max_sge, 1); wqe_size = sizeof(struct mlx5_wqe_data_seg) * num_scatter; if (rwq->wq_sig) wqe_size += sizeof(struct mlx5_rwqe_sig); if (wqe_size <= 0 || wqe_size > ctx->max_rq_desc_sz) return -EINVAL; wqe_size = mlx5_round_up_power_of_two(wqe_size); wq_size = mlx5_round_up_power_of_two(attr->max_wr) * wqe_size; wq_size = max(wq_size, MLX5_SEND_WQE_BB); rwq->rq.wqe_cnt = wq_size / wqe_size; rwq->rq.wqe_shift = mlx5_ilog2(wqe_size); rwq->rq.max_post = 1 << mlx5_ilog2(wq_size / wqe_size); scat_spc = wqe_size - ((rwq->wq_sig) ? sizeof(struct mlx5_rwqe_sig) : 0); rwq->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg); return wq_size; } static int mlx5_calc_rq_size(struct mlx5_context *ctx, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp) { int wqe_size; int wq_size; int scat_spc; FILE *fp = ctx->dbg_fp; if (!attr->cap.max_recv_wr) return 0; if (attr->cap.max_recv_wr > ctx->max_recv_wr) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); return -EINVAL; } wqe_size = mlx5_calc_rcv_wqe(ctx, attr, qp); if (wqe_size < 0 || wqe_size > ctx->max_rq_desc_sz) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); return -EINVAL; } wq_size = mlx5_round_up_power_of_two(attr->cap.max_recv_wr) * wqe_size; if (wqe_size) { wq_size = max(wq_size, MLX5_SEND_WQE_BB); qp->rq.wqe_cnt = wq_size / wqe_size; qp->rq.wqe_shift = mlx5_ilog2(wqe_size); qp->rq.max_post = 1 << mlx5_ilog2(wq_size / wqe_size); scat_spc = wqe_size - (qp->wq_sig ? 
sizeof(struct mlx5_rwqe_sig) : 0); qp->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg); } else { qp->rq.wqe_cnt = 0; qp->rq.wqe_shift = 0; qp->rq.max_post = 0; qp->rq.max_gs = 0; } return wq_size; } static int mlx5_calc_wq_size(struct mlx5_context *ctx, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp) { int ret; int result; ret = mlx5_calc_sq_size(ctx, attr, qp); if (ret < 0) return ret; result = ret; ret = mlx5_calc_rq_size(ctx, attr, qp); if (ret < 0) return ret; result += ret; qp->sq.offset = ret; qp->rq.offset = 0; return result; } static void map_uuar(struct ibv_context *context, struct mlx5_qp *qp, int uuar_index) { struct mlx5_context *ctx = to_mctx(context); qp->bf = &ctx->bfs[uuar_index]; } static const char *qptype2key(enum ibv_qp_type type) { switch (type) { case IBV_QPT_RC: return "HUGE_RC"; case IBV_QPT_UC: return "HUGE_UC"; case IBV_QPT_UD: return "HUGE_UD"; case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH"; default: return "HUGE_NA"; } } static int mlx5_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp, int size) { int err; enum mlx5_alloc_type alloc_type; enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_ANON; const char *qp_huge_key; if (qp->sq.wqe_cnt) { qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid)); if (!qp->sq.wrid) { errno = ENOMEM; err = -1; return err; } qp->sq.wr_data = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data)); if (!qp->sq.wr_data) { errno = ENOMEM; err = -1; goto ex_wrid; } } qp->sq.wqe_head = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head)); if (!qp->sq.wqe_head) { errno = ENOMEM; err = -1; goto ex_wrid; } if (qp->rq.wqe_cnt) { qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t)); if (!qp->rq.wrid) { errno = ENOMEM; err = -1; goto ex_wrid; } } /* compatibility support */ qp_huge_key = qptype2key(qp->ibv_qp->qp_type); if (mlx5_use_huge(qp_huge_key)) default_alloc_type = MLX5_ALLOC_TYPE_HUGE; mlx5_get_alloc_type(MLX5_QP_PREFIX, &alloc_type, default_alloc_type); err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->buf, align(qp->buf_size, to_mdev (context->device)->page_size), to_mdev(context->device)->page_size, alloc_type, MLX5_QP_PREFIX); if (err) { err = -ENOMEM; goto ex_wrid; } memset(qp->buf.buf, 0, qp->buf_size); if (attr->qp_type == IBV_QPT_RAW_PACKET) { size_t aligned_sq_buf_size = align(qp->sq_buf_size, to_mdev(context->device)->page_size); /* For Raw Packet QP, allocate a separate buffer for the SQ */ err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->sq_buf, aligned_sq_buf_size, to_mdev(context->device)->page_size, alloc_type, MLX5_QP_PREFIX); if (err) { err = -ENOMEM; goto rq_buf; } memset(qp->sq_buf.buf, 0, aligned_sq_buf_size); } return 0; rq_buf: mlx5_free_actual_buf(to_mctx(qp->verbs_qp.qp.context), &qp->buf); ex_wrid: if (qp->rq.wrid) free(qp->rq.wrid); if (qp->sq.wqe_head) free(qp->sq.wqe_head); if (qp->sq.wr_data) free(qp->sq.wr_data); if (qp->sq.wrid) free(qp->sq.wrid); return err; } static void mlx5_free_qp_buf(struct mlx5_qp *qp) { struct mlx5_context *ctx = to_mctx(qp->ibv_qp->context); mlx5_free_actual_buf(ctx, &qp->buf); if (qp->sq_buf.buf) mlx5_free_actual_buf(ctx, &qp->sq_buf); if (qp->rq.wrid) free(qp->rq.wrid); if (qp->sq.wqe_head) free(qp->sq.wqe_head); if (qp->sq.wrid) free(qp->sq.wrid); if (qp->sq.wr_data) free(qp->sq.wr_data); } static int mlx5_cmd_create_rss_qp(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp) { struct mlx5_create_qp_ex_rss cmd_ex_rss = {}; struct mlx5_create_qp_resp_ex resp = {}; int ret; if 
(attr->rx_hash_conf.rx_hash_key_len > sizeof(cmd_ex_rss.rx_hash_key)) { errno = EINVAL; return errno; } cmd_ex_rss.rx_hash_fields_mask = attr->rx_hash_conf.rx_hash_fields_mask; cmd_ex_rss.rx_hash_function = attr->rx_hash_conf.rx_hash_function; cmd_ex_rss.rx_key_len = attr->rx_hash_conf.rx_hash_key_len; memcpy(cmd_ex_rss.rx_hash_key, attr->rx_hash_conf.rx_hash_key, attr->rx_hash_conf.rx_hash_key_len); ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, &cmd_ex_rss.ibv_cmd, sizeof(cmd_ex_rss.ibv_cmd), sizeof(cmd_ex_rss), &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp)); if (ret) return ret; qp->rss_qp = 1; return 0; } static int mlx5_cmd_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx5_create_qp *cmd, struct mlx5_qp *qp, struct mlx5_create_qp_resp_ex *resp) { struct mlx5_create_qp_ex cmd_ex; int ret; memset(&cmd_ex, 0, sizeof(cmd_ex)); memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle, offsetof(typeof(cmd->ibv_cmd), is_srq) + sizeof(cmd->ibv_cmd.is_srq) - offsetof(typeof(cmd->ibv_cmd), user_handle)); memcpy(&cmd_ex.drv_ex, &cmd->buf_addr, offsetof(typeof(*cmd), sq_buf_addr) + sizeof(cmd->sq_buf_addr) - sizeof(cmd->ibv_cmd)); ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd), sizeof(cmd_ex), &resp->ibv_resp, sizeof(resp->ibv_resp), sizeof(*resp)); return ret; } enum { MLX5_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_XRCD | IBV_QP_INIT_ATTR_CREATE_FLAGS | IBV_QP_INIT_ATTR_MAX_TSO_HEADER | IBV_QP_INIT_ATTR_IND_TABLE | IBV_QP_INIT_ATTR_RX_HASH), }; enum { MLX5_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS | IBV_QP_INIT_ATTR_MAX_TSO_HEADER | IBV_QP_INIT_ATTR_IND_TABLE | IBV_QP_INIT_ATTR_RX_HASH), }; static struct ibv_qp *create_qp(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr) { struct mlx5_create_qp cmd; struct mlx5_create_qp_resp resp; struct mlx5_create_qp_resp_ex resp_ex; struct mlx5_qp *qp; int ret; struct mlx5_context *ctx = to_mctx(context); struct ibv_qp *ibqp; int32_t usr_idx = 0; uint32_t uuar_index; FILE *fp = ctx->dbg_fp; if (attr->comp_mask & ~MLX5_CREATE_QP_SUP_COMP_MASK) return NULL; if ((attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) && (attr->qp_type != IBV_QPT_RAW_PACKET)) return NULL; qp = calloc(1, sizeof(*qp)); if (!qp) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); return NULL; } ibqp = (struct ibv_qp *)&qp->verbs_qp; qp->ibv_qp = ibqp; memset(&cmd, 0, sizeof(cmd)); memset(&resp, 0, sizeof(resp)); memset(&resp_ex, 0, sizeof(resp_ex)); if (attr->comp_mask & IBV_QP_INIT_ATTR_RX_HASH) { ret = mlx5_cmd_create_rss_qp(context, attr, qp); if (ret) goto err; return ibqp; } qp->wq_sig = qp_sig_enabled(); if (qp->wq_sig) cmd.flags |= MLX5_QP_FLAG_SIGNATURE; if (use_scatter_to_cqe()) cmd.flags |= MLX5_QP_FLAG_SCATTER_CQE; ret = mlx5_calc_wq_size(ctx, attr, qp); if (ret < 0) { errno = -ret; goto err; } if (attr->qp_type == IBV_QPT_RAW_PACKET) { qp->buf_size = qp->sq.offset; qp->sq_buf_size = ret - qp->buf_size; qp->sq.offset = 0; } else { qp->buf_size = ret; qp->sq_buf_size = 0; } if (mlx5_alloc_qp_buf(context, attr, qp, ret)) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); goto err; } if (attr->qp_type == IBV_QPT_RAW_PACKET) { qp->sq_start = qp->sq_buf.buf; qp->sq.qend = qp->sq_buf.buf + (qp->sq.wqe_cnt << qp->sq.wqe_shift); } else { qp->sq_start = qp->buf.buf + qp->sq.offset; qp->sq.qend = qp->buf.buf + qp->sq.offset + (qp->sq.wqe_cnt << qp->sq.wqe_shift); } mlx5_init_qp_indices(qp); - if 
(mlx5_spinlock_init(&qp->sq.lock) || - mlx5_spinlock_init(&qp->rq.lock)) + if (mlx5_spinlock_init(&qp->sq.lock)) goto err_free_qp_buf; + if (mlx5_spinlock_init(&qp->rq.lock)) + goto err_sq_spl; + qp->db = mlx5_alloc_dbrec(ctx); if (!qp->db) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); - goto err_free_qp_buf; + goto err_rq_spl; } qp->db[MLX5_RCV_DBR] = 0; qp->db[MLX5_SND_DBR] = 0; cmd.buf_addr = (uintptr_t) qp->buf.buf; cmd.sq_buf_addr = (attr->qp_type == IBV_QPT_RAW_PACKET) ? (uintptr_t) qp->sq_buf.buf : 0; cmd.db_addr = (uintptr_t) qp->db; cmd.sq_wqe_count = qp->sq.wqe_cnt; cmd.rq_wqe_count = qp->rq.wqe_cnt; cmd.rq_wqe_shift = qp->rq.wqe_shift; if (ctx->atomic_cap == IBV_ATOMIC_HCA) qp->atomics_enabled = 1; if (!ctx->cqe_version) { cmd.uidx = 0xffffff; pthread_mutex_lock(&ctx->qp_table_mutex); } else if (!is_xrc_tgt(attr->qp_type)) { usr_idx = mlx5_store_uidx(ctx, qp); if (usr_idx < 0) { mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n"); goto err_rq_db; } cmd.uidx = usr_idx; } if (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) ret = mlx5_cmd_create_qp_ex(context, attr, &cmd, qp, &resp_ex); else ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) { mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); goto err_free_uidx; } uuar_index = (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) ? resp_ex.uuar_index : resp.uuar_index; if (!ctx->cqe_version) { if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { ret = mlx5_store_qp(ctx, ibqp->qp_num, qp); if (ret) { mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); goto err_destroy; } } pthread_mutex_unlock(&ctx->qp_table_mutex); } map_uuar(context, qp, uuar_index); qp->rq.max_post = qp->rq.wqe_cnt; if (attr->sq_sig_all) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; else qp->sq_signal_bits = 0; attr->cap.max_send_wr = qp->sq.max_post; attr->cap.max_recv_wr = qp->rq.max_post; attr->cap.max_recv_sge = qp->rq.max_gs; qp->rsc.type = MLX5_RSC_TYPE_QP; qp->rsc.rsn = (ctx->cqe_version && !is_xrc_tgt(attr->qp_type)) ? 
usr_idx : ibqp->qp_num; return ibqp; err_destroy: ibv_cmd_destroy_qp(ibqp); err_free_uidx: if (!ctx->cqe_version) pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); else if (!is_xrc_tgt(attr->qp_type)) mlx5_clear_uidx(ctx, usr_idx); err_rq_db: mlx5_free_db(to_mctx(context), qp->db); +err_rq_spl: + mlx5_spinlock_destroy(&qp->rq.lock); + +err_sq_spl: + mlx5_spinlock_destroy(&qp->sq.lock); + err_free_qp_buf: mlx5_free_qp_buf(qp); err: free(qp); return NULL; } struct ibv_qp *mlx5_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) { struct ibv_qp *qp; struct ibv_qp_init_attr_ex attrx; memset(&attrx, 0, sizeof(attrx)); memcpy(&attrx, attr, sizeof(*attr)); attrx.comp_mask = IBV_QP_INIT_ATTR_PD; attrx.pd = pd; qp = create_qp(pd->context, &attrx); if (qp) memcpy(attr, &attrx, sizeof(*attr)); return qp; } static void mlx5_lock_cqs(struct ibv_qp *qp) { struct mlx5_cq *send_cq = to_mcq(qp->send_cq); struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq); if (send_cq && recv_cq) { if (send_cq == recv_cq) { mlx5_spin_lock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { mlx5_spin_lock(&send_cq->lock); mlx5_spin_lock(&recv_cq->lock); } else { mlx5_spin_lock(&recv_cq->lock); mlx5_spin_lock(&send_cq->lock); } } else if (send_cq) { mlx5_spin_lock(&send_cq->lock); } else if (recv_cq) { mlx5_spin_lock(&recv_cq->lock); } } static void mlx5_unlock_cqs(struct ibv_qp *qp) { struct mlx5_cq *send_cq = to_mcq(qp->send_cq); struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq); if (send_cq && recv_cq) { if (send_cq == recv_cq) { mlx5_spin_unlock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { mlx5_spin_unlock(&recv_cq->lock); mlx5_spin_unlock(&send_cq->lock); } else { mlx5_spin_unlock(&send_cq->lock); mlx5_spin_unlock(&recv_cq->lock); } } else if (send_cq) { mlx5_spin_unlock(&send_cq->lock); } else if (recv_cq) { mlx5_spin_unlock(&recv_cq->lock); } } int mlx5_destroy_qp(struct ibv_qp *ibqp) { struct mlx5_qp *qp = to_mqp(ibqp); struct mlx5_context *ctx = to_mctx(ibqp->context); int ret; if (qp->rss_qp) { ret = ibv_cmd_destroy_qp(ibqp); if (ret) return ret; goto free; } if (!ctx->cqe_version) pthread_mutex_lock(&ctx->qp_table_mutex); ret = ibv_cmd_destroy_qp(ibqp); if (ret) { if (!ctx->cqe_version) pthread_mutex_unlock(&ctx->qp_table_mutex); return ret; } mlx5_lock_cqs(ibqp); __mlx5_cq_clean(to_mcq(ibqp->recv_cq), qp->rsc.rsn, ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); if (ibqp->send_cq != ibqp->recv_cq) __mlx5_cq_clean(to_mcq(ibqp->send_cq), qp->rsc.rsn, NULL); if (!ctx->cqe_version) { if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) mlx5_clear_qp(ctx, ibqp->qp_num); } mlx5_unlock_cqs(ibqp); if (!ctx->cqe_version) pthread_mutex_unlock(&ctx->qp_table_mutex); else if (!is_xrc_tgt(ibqp->qp_type)) mlx5_clear_uidx(ctx, qp->rsc.rsn); mlx5_free_db(ctx, qp->db); + mlx5_spinlock_destroy(&qp->rq.lock); + mlx5_spinlock_destroy(&qp->sq.lock); mlx5_free_qp_buf(qp); free: free(qp); return 0; } int mlx5_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd; struct mlx5_qp *qp = to_mqp(ibqp); int ret; if (qp->rss_qp) return ENOSYS; ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof(cmd)); if (ret) return ret; init_attr->cap.max_send_wr = qp->sq.max_post; init_attr->cap.max_send_sge = qp->sq.max_gs; init_attr->cap.max_inline_data = qp->max_inline_data; attr->cap = init_attr->cap; return 0; } enum { MLX5_MODIFY_QP_EX_ATTR_MASK = IBV_QP_RATE_LIMIT, }; int mlx5_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { struct ibv_modify_qp cmd = {}; struct ibv_modify_qp_ex cmd_ex = {}; struct ibv_modify_qp_resp_ex resp = {}; struct mlx5_qp *mqp = to_mqp(qp); struct mlx5_context *context = to_mctx(qp->context); int ret; uint32_t *db; if (mqp->rss_qp) return ENOSYS; if (attr_mask & IBV_QP_PORT) { switch (qp->qp_type) { case IBV_QPT_RAW_PACKET: if (context->cached_link_layer[attr->port_num - 1] == IBV_LINK_LAYER_ETHERNET) { if (context->cached_device_cap_flags & IBV_DEVICE_RAW_IP_CSUM) mqp->qp_cap_cache |= MLX5_CSUM_SUPPORT_RAW_OVER_ETH | MLX5_RX_CSUM_VALID; if (ibv_is_qpt_supported( context->cached_tso_caps.supported_qpts, IBV_QPT_RAW_PACKET)) mqp->max_tso = context->cached_tso_caps.max_tso; } break; default: break; } } if (attr_mask & MLX5_MODIFY_QP_EX_ATTR_MASK) ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex, sizeof(cmd_ex), sizeof(cmd_ex), &resp, sizeof(resp), sizeof(resp)); else ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) { if (qp->recv_cq) { mlx5_cq_clean(to_mcq(qp->recv_cq), mqp->rsc.rsn, qp->srq ? to_msrq(qp->srq) : NULL); } if (qp->send_cq != qp->recv_cq && qp->send_cq) mlx5_cq_clean(to_mcq(qp->send_cq), to_mqp(qp)->rsc.rsn, NULL); mlx5_init_qp_indices(mqp); db = mqp->db; db[MLX5_RCV_DBR] = 0; db[MLX5_SND_DBR] = 0; } /* * When the Raw Packet QP is in INIT state, its RQ * underneath is already in RDY, which means it can * receive packets. According to the IB spec, a QP can't * receive packets until moved to RTR state. To achieve this, * for Raw Packet QPs, we update the doorbell record * once the QP is moved to RTR. 
*/ if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RTR && qp->qp_type == IBV_QPT_RAW_PACKET) { mlx5_spin_lock(&mqp->rq.lock); mqp->db[MLX5_RCV_DBR] = htobe32(mqp->rq.head & 0xffff); mlx5_spin_unlock(&mqp->rq.lock); } return ret; } #define RROCE_UDP_SPORT_MIN 0xC000 #define RROCE_UDP_SPORT_MAX 0xFFFF struct ibv_ah *mlx5_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { struct mlx5_context *ctx = to_mctx(pd->context); struct ibv_port_attr port_attr; struct mlx5_ah *ah; uint32_t gid_type; uint32_t tmp; uint8_t grh; int is_eth; if (attr->port_num < 1 || attr->port_num > ctx->num_ports) return NULL; if (ctx->cached_link_layer[attr->port_num - 1]) { is_eth = ctx->cached_link_layer[attr->port_num - 1] == IBV_LINK_LAYER_ETHERNET; } else { if (ibv_query_port(pd->context, attr->port_num, &port_attr)) return NULL; is_eth = (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET); } if (unlikely((!attr->is_global) && is_eth)) { errno = EINVAL; return NULL; } ah = calloc(1, sizeof *ah); if (!ah) return NULL; if (is_eth) { if (ibv_query_gid_type(pd->context, attr->port_num, attr->grh.sgid_index, &gid_type)) goto err; if (gid_type == IBV_GID_TYPE_ROCE_V2) ah->av.rlid = htobe16(rand() % (RROCE_UDP_SPORT_MAX + 1 - RROCE_UDP_SPORT_MIN) + RROCE_UDP_SPORT_MIN); /* Since RoCE packets must contain GRH, this bit is reserved * for RoCE and shouldn't be set. */ grh = 0; } else { ah->av.fl_mlid = attr->src_path_bits & 0x7f; ah->av.rlid = htobe16(attr->dlid); grh = 1; } ah->av.stat_rate_sl = (attr->static_rate << 4) | attr->sl; if (attr->is_global) { ah->av.tclass = attr->grh.traffic_class; ah->av.hop_limit = attr->grh.hop_limit; tmp = htobe32((grh << 30) | ((attr->grh.sgid_index & 0xff) << 20) | (attr->grh.flow_label & 0xfffff)); ah->av.grh_gid_fl = tmp; memcpy(ah->av.rgid, attr->grh.dgid.raw, 16); } if (is_eth) { if (ctx->cmds_supp_uhw & MLX5_USER_CMDS_SUPP_UHW_CREATE_AH) { struct mlx5_create_ah_resp resp = {}; if (ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp.ibv_resp, sizeof(resp))) goto err; ah->kern_ah = true; memcpy(ah->av.rmac, resp.dmac, ETHERNET_LL_SIZE); } else { uint16_t vid; if (ibv_resolve_eth_l2_from_gid(pd->context, attr, ah->av.rmac, &vid)) goto err; } } return &ah->ibv_ah; err: free(ah); return NULL; } int mlx5_destroy_ah(struct ibv_ah *ah) { struct mlx5_ah *mah = to_mah(ah); int err; if (mah->kern_ah) { err = ibv_cmd_destroy_ah(ah); if (err) return err; } free(mah); return 0; } int mlx5_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) { return ibv_cmd_attach_mcast(qp, gid, lid); } int mlx5_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) { return ibv_cmd_detach_mcast(qp, gid, lid); } struct ibv_qp *mlx5_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr) { return create_qp(context, attr); } int mlx5_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) { struct mlx5_srq *msrq = to_msrq(srq); *srq_num = msrq->srqn; return 0; } struct ibv_xrcd * mlx5_open_xrcd(struct ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr) { int err; struct verbs_xrcd *xrcd; struct ibv_open_xrcd cmd = {}; struct ibv_open_xrcd_resp resp = {}; xrcd = calloc(1, sizeof(*xrcd)); if (!xrcd) return NULL; err = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), xrcd_init_attr, &cmd, sizeof(cmd), &resp, sizeof(resp)); if (err) { free(xrcd); return NULL; } return &xrcd->xrcd; } int mlx5_close_xrcd(struct ibv_xrcd *ib_xrcd) { struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd); int ret; ret = 
ibv_cmd_close_xrcd(xrcd); if (!ret) free(xrcd); return ret; } static struct ibv_srq * mlx5_create_xrc_srq(struct ibv_context *context, struct ibv_srq_init_attr_ex *attr) { int err; struct mlx5_create_srq_ex cmd; struct mlx5_create_srq_resp resp; struct mlx5_srq *msrq; struct mlx5_context *ctx = to_mctx(context); int max_sge; struct ibv_srq *ibsrq; int uidx; FILE *fp = ctx->dbg_fp; msrq = calloc(1, sizeof(*msrq)); if (!msrq) return NULL; ibsrq = (struct ibv_srq *)&msrq->vsrq; memset(&cmd, 0, sizeof(cmd)); memset(&resp, 0, sizeof(resp)); if (mlx5_spinlock_init(&msrq->lock)) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); goto err; } if (attr->attr.max_wr > ctx->max_srq_recv_wr) { fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__, attr->attr.max_wr, ctx->max_srq_recv_wr); errno = EINVAL; - goto err; + goto err_spl; } /* * this calculation does not consider required control segments. The * final calculation is done again later. This is done so to avoid * overflows of variables */ max_sge = ctx->max_recv_wr / sizeof(struct mlx5_wqe_data_seg); if (attr->attr.max_sge > max_sge) { fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__, attr->attr.max_wr, ctx->max_srq_recv_wr); errno = EINVAL; - goto err; + goto err_spl; } msrq->max = align_queue_size(attr->attr.max_wr + 1); msrq->max_gs = attr->attr.max_sge; msrq->counter = 0; if (mlx5_alloc_srq_buf(context, msrq)) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); - goto err; + goto err_spl; } msrq->db = mlx5_alloc_dbrec(ctx); if (!msrq->db) { fprintf(stderr, "%s-%d:\n", __func__, __LINE__); goto err_free; } *msrq->db = 0; cmd.buf_addr = (uintptr_t)msrq->buf.buf; cmd.db_addr = (uintptr_t)msrq->db; msrq->wq_sig = srq_sig_enabled(); if (msrq->wq_sig) cmd.flags = MLX5_SRQ_FLAG_SIGNATURE; attr->attr.max_sge = msrq->max_gs; if (ctx->cqe_version) { uidx = mlx5_store_uidx(ctx, msrq); if (uidx < 0) { mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n"); goto err_free_db; } cmd.uidx = uidx; } else { cmd.uidx = 0xffffff; pthread_mutex_lock(&ctx->srq_table_mutex); } err = ibv_cmd_create_srq_ex(context, &msrq->vsrq, sizeof(msrq->vsrq), attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (err) goto err_free_uidx; if (!ctx->cqe_version) { err = mlx5_store_srq(to_mctx(context), resp.srqn, msrq); if (err) goto err_destroy; pthread_mutex_unlock(&ctx->srq_table_mutex); } msrq->srqn = resp.srqn; msrq->rsc.type = MLX5_RSC_TYPE_XSRQ; msrq->rsc.rsn = ctx->cqe_version ? 
cmd.uidx : resp.srqn; return ibsrq; err_destroy: ibv_cmd_destroy_srq(ibsrq); err_free_uidx: if (ctx->cqe_version) mlx5_clear_uidx(ctx, cmd.uidx); else pthread_mutex_unlock(&ctx->srq_table_mutex); err_free_db: mlx5_free_db(ctx, msrq->db); err_free: free(msrq->wrid); mlx5_free_buf(&msrq->buf); +err_spl: + mlx5_spinlock_destroy(&msrq->lock); + err: free(msrq); return NULL; } struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, struct ibv_srq_init_attr_ex *attr) { if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) || (attr->srq_type == IBV_SRQT_BASIC)) return mlx5_create_srq(attr->pd, (struct ibv_srq_init_attr *)attr); else if (attr->srq_type == IBV_SRQT_XRC) return mlx5_create_xrc_srq(context, attr); return NULL; } int mlx5_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size) { struct mlx5_context *mctx = to_mctx(context); struct mlx5_query_device_ex_resp resp; struct mlx5_query_device_ex cmd; struct ibv_device_attr *a; uint64_t raw_fw_ver; unsigned sub_minor; unsigned major; unsigned minor; int err; int cmd_supp_uhw = mctx->cmds_supp_uhw & MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE; memset(&cmd, 0, sizeof(cmd)); memset(&resp, 0, sizeof(resp)); err = ibv_cmd_query_device_ex(context, input, attr, attr_size, &raw_fw_ver, &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp, sizeof(resp.ibv_resp), cmd_supp_uhw ? sizeof(resp) : sizeof(resp.ibv_resp)); if (err) return err; attr->tso_caps = resp.tso_caps; attr->rss_caps.rx_hash_fields_mask = resp.rss_caps.rx_hash_fields_mask; attr->rss_caps.rx_hash_function = resp.rss_caps.rx_hash_function; attr->packet_pacing_caps = resp.packet_pacing_caps.caps; if (resp.support_multi_pkt_send_wqe) mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_MPW; mctx->cqe_comp_caps = resp.cqe_comp_caps; major = (raw_fw_ver >> 32) & 0xffff; minor = (raw_fw_ver >> 16) & 0xffff; sub_minor = raw_fw_ver & 0xffff; a = &attr->orig_attr; snprintf(a->fw_ver, sizeof(a->fw_ver), "%d.%d.%04d", major, minor, sub_minor); return 0; } static int rwq_sig_enabled(struct ibv_context *context) { char *env; env = getenv("MLX5_RWQ_SIGNATURE"); if (env) return 1; return 0; } static void mlx5_free_rwq_buf(struct mlx5_rwq *rwq, struct ibv_context *context) { struct mlx5_context *ctx = to_mctx(context); mlx5_free_actual_buf(ctx, &rwq->buf); free(rwq->rq.wrid); } static int mlx5_alloc_rwq_buf(struct ibv_context *context, struct mlx5_rwq *rwq, int size) { int err; enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_PREFER_CONTIG; rwq->rq.wrid = malloc(rwq->rq.wqe_cnt * sizeof(uint64_t)); if (!rwq->rq.wrid) { errno = ENOMEM; return -1; } err = mlx5_alloc_prefered_buf(to_mctx(context), &rwq->buf, align(rwq->buf_size, to_mdev (context->device)->page_size), to_mdev(context->device)->page_size, default_alloc_type, MLX5_RWQ_PREFIX); if (err) { free(rwq->rq.wrid); errno = ENOMEM; return -1; } return 0; } struct ibv_wq *mlx5_create_wq(struct ibv_context *context, struct ibv_wq_init_attr *attr) { struct mlx5_create_wq cmd; struct mlx5_create_wq_resp resp; int err; struct mlx5_rwq *rwq; struct mlx5_context *ctx = to_mctx(context); int ret; int32_t usr_idx = 0; FILE *fp = ctx->dbg_fp; if (attr->wq_type != IBV_WQT_RQ) return NULL; memset(&cmd, 0, sizeof(cmd)); memset(&resp, 0, sizeof(resp)); rwq = calloc(1, sizeof(*rwq)); if (!rwq) return NULL; rwq->wq_sig = rwq_sig_enabled(context); if (rwq->wq_sig) cmd.drv.flags = MLX5_RWQ_FLAG_SIGNATURE; ret = mlx5_calc_rwq_size(ctx, rwq, attr); if (ret < 0) { errno = 
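The new err_spl label keeps the unwind labels in reverse order of acquisition, so the spinlock initialized at the top of mlx5_create_xrc_srq is destroyed on every failure path. The same shape in isolation, with a plain pthread spinlock and a hypothetical resource bundle standing in for the driver's types:

#include <pthread.h>
#include <stdlib.h>

struct res {
	pthread_spinlock_t lock;
	void *buf;
};

static struct res *res_create(size_t len)
{
	struct res *r = calloc(1, sizeof(*r));

	if (!r)
		return NULL;
	if (pthread_spin_init(&r->lock, PTHREAD_PROCESS_PRIVATE))
		goto err_free;        /* nothing else to undo yet */
	r->buf = malloc(len);
	if (!r->buf)
		goto err_spl;         /* lock exists, so it must be destroyed */
	return r;

err_spl:
	pthread_spin_destroy(&r->lock);
err_free:
	free(r);
	return NULL;
}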
-ret; goto err; } + ret = ibv_init_wq(&rwq->wq); + if (ret < 0) + goto err; + rwq->buf_size = ret; if (mlx5_alloc_rwq_buf(context, rwq, ret)) - goto err; + goto err_cleanup_wq; mlx5_init_rwq_indices(rwq); if (mlx5_spinlock_init(&rwq->rq.lock)) goto err_free_rwq_buf; rwq->db = mlx5_alloc_dbrec(ctx); if (!rwq->db) - goto err_free_rwq_buf; + goto err_spl; rwq->db[MLX5_RCV_DBR] = 0; rwq->db[MLX5_SND_DBR] = 0; rwq->pbuff = rwq->buf.buf + rwq->rq.offset; rwq->recv_db = &rwq->db[MLX5_RCV_DBR]; cmd.drv.buf_addr = (uintptr_t)rwq->buf.buf; cmd.drv.db_addr = (uintptr_t)rwq->db; cmd.drv.rq_wqe_count = rwq->rq.wqe_cnt; cmd.drv.rq_wqe_shift = rwq->rq.wqe_shift; usr_idx = mlx5_store_uidx(ctx, rwq); if (usr_idx < 0) { mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n"); goto err_free_db_rec; } cmd.drv.user_index = usr_idx; err = ibv_cmd_create_wq(context, attr, &rwq->wq, &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp)); if (err) goto err_create; rwq->rsc.type = MLX5_RSC_TYPE_RWQ; rwq->rsc.rsn = cmd.drv.user_index; rwq->wq.post_recv = mlx5_post_wq_recv; return &rwq->wq; err_create: mlx5_clear_uidx(ctx, cmd.drv.user_index); err_free_db_rec: mlx5_free_db(to_mctx(context), rwq->db); +err_spl: + mlx5_spinlock_destroy(&rwq->rq.lock); err_free_rwq_buf: mlx5_free_rwq_buf(rwq, context); +err_cleanup_wq: + ibv_cleanup_wq(&rwq->wq); err: free(rwq); return NULL; } int mlx5_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr) { struct mlx5_modify_wq cmd = {}; struct mlx5_rwq *rwq = to_mrwq(wq); if ((attr->attr_mask & IBV_WQ_ATTR_STATE) && attr->wq_state == IBV_WQS_RDY) { if ((attr->attr_mask & IBV_WQ_ATTR_CURR_STATE) && attr->curr_wq_state != wq->state) return -EINVAL; if (wq->state == IBV_WQS_RESET) { mlx5_spin_lock(&to_mcq(wq->cq)->lock); __mlx5_cq_clean(to_mcq(wq->cq), rwq->rsc.rsn, NULL); mlx5_spin_unlock(&to_mcq(wq->cq)->lock); mlx5_init_rwq_indices(rwq); rwq->db[MLX5_RCV_DBR] = 0; rwq->db[MLX5_SND_DBR] = 0; } } return ibv_cmd_modify_wq(wq, attr, &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd)); } int mlx5_destroy_wq(struct ibv_wq *wq) { struct mlx5_rwq *rwq = to_mrwq(wq); int ret; ret = ibv_cmd_destroy_wq(wq); if (ret) return ret; mlx5_spin_lock(&to_mcq(wq->cq)->lock); __mlx5_cq_clean(to_mcq(wq->cq), rwq->rsc.rsn, NULL); mlx5_spin_unlock(&to_mcq(wq->cq)->lock); mlx5_clear_uidx(to_mctx(wq->context), rwq->rsc.rsn); mlx5_free_db(to_mctx(wq->context), rwq->db); + mlx5_spinlock_destroy(&rwq->rq.lock); mlx5_free_rwq_buf(rwq, wq->context); + ibv_cleanup_wq(&rwq->wq); free(rwq); return 0; } struct ibv_rwq_ind_table *mlx5_create_rwq_ind_table(struct ibv_context *context, struct ibv_rwq_ind_table_init_attr *init_attr) { struct ibv_create_rwq_ind_table *cmd; struct mlx5_create_rwq_ind_table_resp resp; struct ibv_rwq_ind_table *ind_table; uint32_t required_tbl_size; int num_tbl_entries; int cmd_size; int err; num_tbl_entries = 1 << init_attr->log_ind_tbl_size; /* Data must be u64 aligned */ required_tbl_size = (num_tbl_entries * sizeof(uint32_t)) < sizeof(uint64_t) ? 
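Both mlx5_modify_wq (on the RESET-to-RDY transition) and mlx5_destroy_wq above scrub the attached CQ of completions that belong to the recycled work queue. A conceptual, self-contained sketch of what that cleaning amounts to, using a hypothetical ring and helper rather than the provider's __mlx5_cq_clean:

#include <stdint.h>

struct cqe_like { uint32_t rsn; uint64_t wr_id; };

struct ring {
	struct cqe_like *ent;
	unsigned int head, tail, mask;  /* capacity is a power of two */
};

/* Drop every entry tagged with @rsn, keeping the rest in order. */
static void ring_clean(struct ring *r, uint32_t rsn)
{
	unsigned int in, out;

	for (in = out = r->head; in != r->tail; in++) {
		if (r->ent[in & r->mask].rsn == rsn)
			continue;                       /* stale completion */
		if (out != in)
			r->ent[out & r->mask] = r->ent[in & r->mask];
		out++;
	}
	r->tail = out;                                  /* kept entries only */
}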
sizeof(uint64_t) : (num_tbl_entries * sizeof(uint32_t)); cmd_size = required_tbl_size + sizeof(*cmd); cmd = calloc(1, cmd_size); if (!cmd) return NULL; memset(&resp, 0, sizeof(resp)); ind_table = calloc(1, sizeof(*ind_table)); if (!ind_table) goto free_cmd; err = ibv_cmd_create_rwq_ind_table(context, init_attr, ind_table, cmd, cmd_size, cmd_size, &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp)); if (err) goto err; free(cmd); return ind_table; err: free(ind_table); free_cmd: free(cmd); return NULL; } int mlx5_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) { int ret; ret = ibv_cmd_destroy_rwq_ind_table(rwq_ind_table); if (ret) return ret; free(rwq_ind_table); return 0; } diff --git a/contrib/ofed/librdmacm/cma.c b/contrib/ofed/librdmacm/cma.c index 833888757a1c..3c108b392a8a 100644 --- a/contrib/ofed/librdmacm/cma.c +++ b/contrib/ofed/librdmacm/cma.c @@ -1,2460 +1,2461 @@ /* * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cma.h" #include "indexer.h" #include #include #include #include #include #include #define CMA_INIT_CMD(req, req_size, op) \ do { \ memset(req, 0, req_size); \ (req)->cmd = UCMA_CMD_##op; \ (req)->in = req_size - sizeof(struct ucma_abi_cmd_hdr); \ } while (0) #define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ do { \ CMA_INIT_CMD(req, req_size, op); \ (req)->out = resp_size; \ (req)->response = (uintptr_t) (resp); \ } while (0) struct cma_port { uint8_t link_layer; }; struct cma_device { struct ibv_context *verbs; struct ibv_pd *pd; struct ibv_xrcd *xrcd; struct cma_port *port; __be64 guid; int port_cnt; int refcnt; int max_qpsize; uint8_t max_initiator_depth; uint8_t max_responder_resources; }; struct cma_id_private { struct rdma_cm_id id; struct cma_device *cma_dev; void *connect; size_t connect_len; int events_completed; int connect_error; int sync; pthread_cond_t cond; pthread_mutex_t mut; uint32_t handle; struct cma_multicast *mc_list; struct ibv_qp_init_attr *qp_init_attr; uint8_t initiator_depth; uint8_t responder_resources; }; struct cma_multicast { struct cma_multicast *next; struct cma_id_private *id_priv; void *context; int events_completed; pthread_cond_t cond; uint32_t handle; union ibv_gid mgid; uint16_t mlid; struct sockaddr_storage addr; }; struct cma_event { struct rdma_cm_event event; uint8_t private_data[RDMA_MAX_PRIVATE_DATA]; struct cma_id_private *id_priv; struct cma_multicast *mc; }; static struct cma_device *cma_dev_array; static int cma_dev_cnt; static int cma_init_cnt; static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION; int af_ib_support; static struct index_map ucma_idm; static fastlock_t idm_lock; static int check_abi_version(void) { char value[8]; if ((ibv_read_sysfs_file(ibv_get_sysfs_path(), "class/misc/rdma_cm/abi_version", value, sizeof value) < 0) && (ibv_read_sysfs_file(ibv_get_sysfs_path(), "class/infiniband_ucma/abi_version", value, sizeof value) < 0)) { /* * Older version of Linux do not have class/misc. To support * backports, assume the most recent version of the ABI. If * we're wrong, we'll simply fail later when calling the ABI. */ return 0; } abi_ver = strtol(value, NULL, 10); if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION || abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) { return -1; } return 0; } /* * This function is called holding the mutex lock * cma_dev_cnt must be set before calling this function to * ensure that the lock is not acquired recursively. 
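Every command in this file is built with CMA_INIT_CMD/CMA_INIT_CMD_RESP before being written to the rdma_cm device. A sketch of the convention those macros capture, using a hypothetical command layout whose field names only mirror the ucma ABI header:

#include <stdint.h>
#include <string.h>

struct hdr { uint32_t cmd; uint16_t in; uint16_t out; };

struct my_cmd {
	struct hdr hdr;
	uint64_t response;   /* user pointer the kernel writes the reply through */
	uint32_t id;
	uint32_t reserved;
};

static void init_cmd_resp(struct my_cmd *c, uint32_t op,
			  void *resp, size_t resp_size)
{
	memset(c, 0, sizeof(*c));
	c->hdr.cmd = op;
	/* payload size excludes the fixed header, as in CMA_INIT_CMD */
	c->hdr.in = sizeof(*c) - sizeof(struct hdr);
	c->hdr.out = resp_size;
	c->response = (uintptr_t)resp;
}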
*/ static void ucma_set_af_ib_support(void) { struct rdma_cm_id *id; struct sockaddr_ib sib; int ret; ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB); if (ret) return; memset(&sib, 0, sizeof sib); sib.sib_family = AF_IB; sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP); sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK); af_ib_support = 1; ret = rdma_bind_addr(id, (struct sockaddr *) &sib); af_ib_support = !ret; rdma_destroy_id(id); } int ucma_init(void) { struct ibv_device **dev_list = NULL; int i, ret, dev_cnt; /* Quick check without lock to see if we're already initialized */ if (cma_dev_cnt) return 0; pthread_mutex_lock(&mut); if (cma_dev_cnt) { pthread_mutex_unlock(&mut); return 0; } fastlock_init(&idm_lock); ret = check_abi_version(); if (ret) goto err1; dev_list = ibv_get_device_list(&dev_cnt); if (!dev_list) { ret = ERR(ENODEV); goto err1; } if (!dev_cnt) { ret = ERR(ENODEV); goto err2; } cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array)); if (!cma_dev_array) { ret = ERR(ENOMEM); goto err2; } for (i = 0; dev_list[i]; i++) cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]); cma_dev_cnt = dev_cnt; ucma_set_af_ib_support(); pthread_mutex_unlock(&mut); ibv_free_device_list(dev_list); return 0; err2: ibv_free_device_list(dev_list); err1: fastlock_destroy(&idm_lock); pthread_mutex_unlock(&mut); return ret; } static struct ibv_context *ucma_open_device(__be64 guid) { struct ibv_device **dev_list; struct ibv_context *verbs = NULL; int i; dev_list = ibv_get_device_list(NULL); if (!dev_list) { return NULL; } for (i = 0; dev_list[i]; i++) { if (ibv_get_device_guid(dev_list[i]) == guid) { verbs = ibv_open_device(dev_list[i]); break; } } ibv_free_device_list(dev_list); return verbs; } static int ucma_init_device(struct cma_device *cma_dev) { struct ibv_port_attr port_attr; struct ibv_device_attr attr; int i, ret; if (cma_dev->verbs) return 0; cma_dev->verbs = ucma_open_device(cma_dev->guid); if (!cma_dev->verbs) return ERR(ENODEV); ret = ibv_query_device(cma_dev->verbs, &attr); if (ret) { ret = ERR(ret); goto err; } cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt); if (!cma_dev->port) { ret = ERR(ENOMEM); goto err; } for (i = 1; i <= attr.phys_port_cnt; i++) { if (ibv_query_port(cma_dev->verbs, i, &port_attr)) cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED; else cma_dev->port[i - 1].link_layer = port_attr.link_layer; } cma_dev->port_cnt = attr.phys_port_cnt; cma_dev->max_qpsize = attr.max_qp_wr; cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; cma_init_cnt++; return 0; err: ibv_close_device(cma_dev->verbs); cma_dev->verbs = NULL; return ret; } static int ucma_init_all(void) { int i, ret = 0; if (!cma_dev_cnt) { ret = ucma_init(); if (ret) return ret; } if (cma_init_cnt == cma_dev_cnt) return 0; pthread_mutex_lock(&mut); for (i = 0; i < cma_dev_cnt; i++) { ret = ucma_init_device(&cma_dev_array[i]); if (ret) break; } pthread_mutex_unlock(&mut); return ret; } struct ibv_context **rdma_get_devices(int *num_devices) { struct ibv_context **devs = NULL; int i; if (ucma_init_all()) goto out; devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1)); if (!devs) goto out; for (i = 0; i < cma_dev_cnt; i++) devs[i] = cma_dev_array[i].verbs; devs[i] = NULL; out: if (num_devices) *num_devices = devs ? 
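ucma_init() uses a cheap unlocked check followed by a re-check under the mutex so initialization runs exactly once even with concurrent callers; the unlocked read is only a fast path, and the locked re-check is what provides the guarantee. The same shape reduced to its essentials (do_expensive_init is a placeholder):

#include <pthread.h>

static pthread_mutex_t init_mut = PTHREAD_MUTEX_INITIALIZER;
static int initialized;

static int do_expensive_init(void) { return 0; }  /* placeholder */

static int lazy_init(void)
{
	int ret = 0;

	if (initialized)                 /* fast path, no lock taken */
		return 0;
	pthread_mutex_lock(&init_mut);
	if (!initialized) {              /* re-check under the lock */
		ret = do_expensive_init();
		if (!ret)
			initialized = 1;
	}
	pthread_mutex_unlock(&init_mut);
	return ret;
}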
cma_dev_cnt : 0; return devs; } void rdma_free_devices(struct ibv_context **list) { free(list); } struct rdma_event_channel *rdma_create_event_channel(void) { struct rdma_event_channel *channel; if (ucma_init()) return NULL; channel = malloc(sizeof(*channel)); if (!channel) return NULL; channel->fd = open("/dev/rdma_cm", O_RDWR | O_CLOEXEC); if (channel->fd < 0) { goto err; } return channel; err: free(channel); return NULL; } void rdma_destroy_event_channel(struct rdma_event_channel *channel) { close(channel->fd); free(channel); } static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid) { struct cma_device *cma_dev; int i, ret; for (i = 0; i < cma_dev_cnt; i++) { cma_dev = &cma_dev_array[i]; if (cma_dev->guid == guid) goto match; } return ERR(ENODEV); match: pthread_mutex_lock(&mut); if ((ret = ucma_init_device(cma_dev))) goto out; if (!cma_dev->refcnt++) { cma_dev->pd = ibv_alloc_pd(cma_dev->verbs); if (!cma_dev->pd) { cma_dev->refcnt--; ret = ERR(ENOMEM); goto out; } } id_priv->cma_dev = cma_dev; id_priv->id.verbs = cma_dev->verbs; id_priv->id.pd = cma_dev->pd; out: pthread_mutex_unlock(&mut); return ret; } static void ucma_put_device(struct cma_device *cma_dev) { pthread_mutex_lock(&mut); if (!--cma_dev->refcnt) { ibv_dealloc_pd(cma_dev->pd); if (cma_dev->xrcd) ibv_close_xrcd(cma_dev->xrcd); } pthread_mutex_unlock(&mut); } static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev) { struct ibv_xrcd_init_attr attr; pthread_mutex_lock(&mut); if (!cma_dev->xrcd) { memset(&attr, 0, sizeof attr); attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; attr.fd = -1; attr.oflags = O_CREAT; cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr); } pthread_mutex_unlock(&mut); return cma_dev->xrcd; } static void ucma_insert_id(struct cma_id_private *id_priv) { fastlock_acquire(&idm_lock); idm_set(&ucma_idm, id_priv->handle, id_priv); fastlock_release(&idm_lock); } static void ucma_remove_id(struct cma_id_private *id_priv) { if (id_priv->handle <= IDX_MAX_INDEX) idm_clear(&ucma_idm, id_priv->handle); } static struct cma_id_private *ucma_lookup_id(int handle) { return idm_lookup(&ucma_idm, handle); } static void ucma_free_id(struct cma_id_private *id_priv) { ucma_remove_id(id_priv); if (id_priv->cma_dev) ucma_put_device(id_priv->cma_dev); pthread_cond_destroy(&id_priv->cond); pthread_mutex_destroy(&id_priv->mut); if (id_priv->id.route.path_rec) free(id_priv->id.route.path_rec); if (id_priv->sync) rdma_destroy_event_channel(id_priv->id.channel); if (id_priv->connect_len) free(id_priv->connect); free(id_priv); } static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel, void *context, enum rdma_port_space ps, enum ibv_qp_type qp_type) { struct cma_id_private *id_priv; id_priv = calloc(1, sizeof(*id_priv)); if (!id_priv) return NULL; id_priv->id.context = context; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->handle = 0xFFFFFFFF; if (!channel) { id_priv->id.channel = rdma_create_event_channel(); if (!id_priv->id.channel) goto err; id_priv->sync = 1; } else { id_priv->id.channel = channel; } - pthread_mutex_init(&id_priv->mut, NULL); + if (pthread_mutex_init(&id_priv->mut, NULL)) + goto err; if (pthread_cond_init(&id_priv->cond, NULL)) goto err; return id_priv; err: ucma_free_id(id_priv); return NULL; } static int rdma_create_id2(struct rdma_event_channel *channel, struct rdma_cm_id **id, void *context, enum rdma_port_space ps, enum ibv_qp_type qp_type) { struct ucma_abi_create_id_resp resp; struct ucma_abi_create_id cmd; struct 
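The hunk above starts treating pthread_mutex_init() as fallible, matching the existing check on pthread_cond_init(). A slightly stricter standalone variant of that pattern, which only destroys what was actually initialized (the types here are hypothetical):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t mut;
	pthread_cond_t cond;
};

static struct obj *obj_new(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return NULL;
	if (pthread_mutex_init(&o->mut, NULL))
		goto err_free;
	if (pthread_cond_init(&o->cond, NULL))
		goto err_mut;
	return o;

err_mut:
	pthread_mutex_destroy(&o->mut);
err_free:
	free(o);
	return NULL;
}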
cma_id_private *id_priv; int ret; ret = ucma_init(); if (ret) return ret; id_priv = ucma_alloc_id(channel, context, ps, qp_type); if (!id_priv) return ERR(ENOMEM); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp); cmd.uid = (uintptr_t) id_priv; cmd.ps = ps; cmd.qp_type = qp_type; ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) goto err; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); id_priv->handle = resp.id; ucma_insert_id(id_priv); *id = &id_priv->id; return 0; err: ucma_free_id(id_priv); return ret; } int rdma_create_id(struct rdma_event_channel *channel, struct rdma_cm_id **id, void *context, enum rdma_port_space ps) { enum ibv_qp_type qp_type; qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ? IBV_QPT_UD : IBV_QPT_RC; return rdma_create_id2(channel, id, context, ps, qp_type); } static int ucma_destroy_kern_id(int fd, uint32_t handle) { struct ucma_abi_destroy_id_resp resp; struct ucma_abi_destroy_id cmd; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp); cmd.id = handle; ret = write(fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); return resp.events_reported; } int rdma_destroy_id(struct rdma_cm_id *id) { struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle); if (ret < 0) return ret; if (id_priv->id.event) rdma_ack_cm_event(id_priv->id.event); pthread_mutex_lock(&id_priv->mut); while (id_priv->events_completed < ret) pthread_cond_wait(&id_priv->cond, &id_priv->mut); pthread_mutex_unlock(&id_priv->mut); ucma_free_id(id_priv); return 0; } int ucma_addrlen(struct sockaddr *addr) { if (!addr) return 0; switch (addr->sa_family) { case PF_INET: return sizeof(struct sockaddr_in); case PF_INET6: return sizeof(struct sockaddr_in6); case PF_IB: return af_ib_support ? sizeof(struct sockaddr_ib) : 0; default: return 0; } } static int ucma_query_addr(struct rdma_cm_id *id) { struct ucma_abi_query_addr_resp resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.option = UCMA_QUERY_ADDR; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size); memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); if (!id_priv->cma_dev && resp.node_guid) { ret = ucma_get_device(id_priv, resp.node_guid); if (ret) return ret; id->port_num = resp.port_num; id->route.addr.addr.ibaddr.pkey = resp.pkey; } return 0; } static int ucma_query_gid(struct rdma_cm_id *id) { struct ucma_abi_query_addr_resp resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; struct sockaddr_ib *sib; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.option = UCMA_QUERY_GID; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? 
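rdma_destroy_id() may only free the id after every event the kernel already reported has been acknowledged; the events_completed counter and condition variable implement that handshake. Both sides in isolation, with a hypothetical waiter struct:

#include <pthread.h>

struct waiter {
	pthread_mutex_t mut;
	pthread_cond_t cond;
	int completed;
};

/* destroy side: block until @reported events have been acknowledged */
static void wait_for_events(struct waiter *w, int reported)
{
	pthread_mutex_lock(&w->mut);
	while (w->completed < reported)
		pthread_cond_wait(&w->cond, &w->mut);
	pthread_mutex_unlock(&w->mut);
}

/* ack side: called once per delivered event (cf. ucma_complete_event) */
static void ack_event(struct waiter *w)
{
	pthread_mutex_lock(&w->mut);
	w->completed++;
	pthread_cond_signal(&w->cond);
	pthread_mutex_unlock(&w->mut);
}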
ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); sib = (struct sockaddr_ib *) &resp.src_addr; memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw, sizeof id->route.addr.addr.ibaddr.sgid); sib = (struct sockaddr_ib *) &resp.dst_addr; memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw, sizeof id->route.addr.addr.ibaddr.dgid); return 0; } static void ucma_convert_path(struct ibv_path_data *path_data, struct ibv_sa_path_rec *sa_path) { uint32_t fl_hop; sa_path->dgid = path_data->path.dgid; sa_path->sgid = path_data->path.sgid; sa_path->dlid = path_data->path.dlid; sa_path->slid = path_data->path.slid; sa_path->raw_traffic = 0; fl_hop = be32toh(path_data->path.flowlabel_hoplimit); sa_path->flow_label = htobe32(fl_hop >> 8); sa_path->hop_limit = (uint8_t) fl_hop; sa_path->traffic_class = path_data->path.tclass; sa_path->reversible = path_data->path.reversible_numpath >> 7; sa_path->numb_path = 1; sa_path->pkey = path_data->path.pkey; sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF; sa_path->mtu_selector = 2; /* exactly */ sa_path->mtu = path_data->path.mtu & 0x1F; sa_path->rate_selector = 2; sa_path->rate = path_data->path.rate & 0x1F; sa_path->packet_life_time_selector = 2; sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; sa_path->preference = (uint8_t) path_data->flags; } static int ucma_query_path(struct rdma_cm_id *id) { struct ucma_abi_query_path_resp *resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; int ret, i, size; size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; resp = alloca(size); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.option = UCMA_QUERY_PATH; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, size); if (resp->num_paths) { id->route.path_rec = malloc(sizeof(*id->route.path_rec) * resp->num_paths); if (!id->route.path_rec) return ERR(ENOMEM); id->route.num_paths = resp->num_paths; for (i = 0; i < resp->num_paths; i++) ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]); } return 0; } static int ucma_query_route(struct rdma_cm_id *id) { struct ucma_abi_query_route_resp resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; int ret, i; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? 
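ucma_convert_path() unpacks the kernel's combined flowlabel_hoplimit word: the low 8 bits carry the hop limit and the remaining bits the flow label, which the original immediately re-stores big-endian. A hypothetical helper showing just that unpacking, assuming a glibc-style <endian.h> for be32toh:

#include <stdint.h>
#include <endian.h>

static void unpack_fl_hop(uint32_t flowlabel_hoplimit_be,
			  uint32_t *flow_label, uint8_t *hop_limit)
{
	uint32_t fl_hop = be32toh(flowlabel_hoplimit_be);

	*flow_label = fl_hop >> 8;      /* upper bits hold the flow label */
	*hop_limit  = (uint8_t)fl_hop;  /* truncation keeps the low byte */
}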
ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); if (resp.num_paths) { id->route.path_rec = malloc(sizeof(*id->route.path_rec) * resp.num_paths); if (!id->route.path_rec) return ERR(ENOMEM); id->route.num_paths = resp.num_paths; for (i = 0; i < resp.num_paths; i++) ibv_copy_path_rec_from_kern(&id->route.path_rec[i], &resp.ib_route[i]); } memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid, sizeof id->route.addr.addr.ibaddr.sgid); memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid, sizeof id->route.addr.addr.ibaddr.dgid); id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey; memcpy(&id->route.addr.src_addr, &resp.src_addr, sizeof resp.src_addr); memcpy(&id->route.addr.dst_addr, &resp.dst_addr, sizeof resp.dst_addr); if (!id_priv->cma_dev && resp.node_guid) { ret = ucma_get_device(id_priv, resp.node_guid); if (ret) return ret; id_priv->id.port_num = resp.port_num; } return 0; } static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr, socklen_t addrlen) { struct ucma_abi_bind cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, BIND); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.addr_size = addrlen; memcpy(&cmd.addr, addr, addrlen); ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; ret = ucma_query_addr(id); if (!ret) ret = ucma_query_gid(id); return ret; } int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct ucma_abi_bind_ip cmd; struct cma_id_private *id_priv; int ret, addrlen; addrlen = ucma_addrlen(addr); if (!addrlen) return ERR(EINVAL); if (af_ib_support) return rdma_bind_addr2(id, addr, addrlen); CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; memcpy(&cmd.addr, addr, addrlen); ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return ucma_query_route(id); } int ucma_complete(struct rdma_cm_id *id) { struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); if (!id_priv->sync) return 0; if (id_priv->id.event) { rdma_ack_cm_event(id_priv->id.event); id_priv->id.event = NULL; } ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event); if (ret) return ret; if (id_priv->id.event->status) { if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED) ret = ERR(ECONNREFUSED); else if (id_priv->id.event->status < 0) ret = ERR(-id_priv->id.event->status); else ret = ERR(-id_priv->id.event->status); } return ret; } static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr, socklen_t src_len, struct sockaddr *dst_addr, socklen_t dst_len, int timeout_ms) { struct ucma_abi_resolve_addr cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; if ((cmd.src_size = src_len)) memcpy(&cmd.src_addr, src_addr, src_len); memcpy(&cmd.dst_addr, dst_addr, dst_len); cmd.dst_size = dst_len; cmd.timeout_ms = timeout_ms; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? 
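All of these command paths share one error convention for the write(): a negative return propagates errno as-is, while a short write is reported as ENODATA (ERR() in this library boils down to setting errno and returning -1). An approximate standalone helper expressing that rule:

#include <errno.h>
#include <unistd.h>

static int issue_cmd(int fd, const void *cmd, size_t len)
{
	ssize_t ret = write(fd, cmd, len);

	if (ret < 0)
		return -1;              /* errno already set by write() */
	if ((size_t)ret != len) {
		errno = ENODATA;        /* kernel consumed a partial command */
		return -1;
	}
	return 0;
}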
ERR(ENODATA) : -1; memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); return ucma_complete(id); } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms) { struct ucma_abi_resolve_ip cmd; struct cma_id_private *id_priv; int ret, dst_len, src_len; dst_len = ucma_addrlen(dst_addr); if (!dst_len) return ERR(EINVAL); src_len = ucma_addrlen(src_addr); if (src_addr && !src_len) return ERR(EINVAL); if (af_ib_support) return rdma_resolve_addr2(id, src_addr, src_len, dst_addr, dst_len, timeout_ms); CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; if (src_addr) memcpy(&cmd.src_addr, src_addr, src_len); memcpy(&cmd.dst_addr, dst_addr, dst_len); cmd.timeout_ms = timeout_ms; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); return ucma_complete(id); } static int ucma_set_ib_route(struct rdma_cm_id *id) { struct rdma_addrinfo hint, *rai; int ret; memset(&hint, 0, sizeof hint); hint.ai_flags = RAI_ROUTEONLY; hint.ai_family = id->route.addr.src_addr.sa_family; hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr); hint.ai_src_addr = &id->route.addr.src_addr; hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr); hint.ai_dst_addr = &id->route.addr.dst_addr; ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai); if (ret) return ret; if (rai->ai_route_len) ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, rai->ai_route, rai->ai_route_len); else ret = -1; rdma_freeaddrinfo(rai); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct ucma_abi_resolve_route cmd; struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) { ret = ucma_set_ib_route(id); if (!ret) goto out; } CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE); cmd.id = id_priv->handle; cmd.timeout_ms = timeout_ms; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; out: return ucma_complete(id); } static int ucma_is_ud_qp(enum ibv_qp_type qp_type) { return (qp_type == IBV_QPT_UD); } static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, int *qp_attr_mask) { struct ucma_abi_init_qp_attr cmd; struct ibv_kern_qp_attr resp; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.qp_state = qp_attr->qp_state; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); ibv_copy_qp_attr_from_kern(qp_attr, &resp); *qp_attr_mask = resp.qp_attr_mask; return 0; } static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res) { struct cma_id_private *id_priv; struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; uint8_t link_layer; if (!id->qp) return ERR(EINVAL); /* Need to update QP attributes from default values. 
*/ qp_attr.qp_state = IBV_QPS_INIT; ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTR; ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); if (ret) return ret; /* * Workaround for rdma_ucm kernel bug: * mask off qp_attr_mask bits 21-24 which are used for RoCE */ id_priv = container_of(id, struct cma_id_private, id); link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer; if (link_layer == IBV_LINK_LAYER_INFINIBAND) qp_attr_mask &= UINT_MAX ^ 0xe00000; if (resp_res != RDMA_MAX_RESP_RES) qp_attr.max_dest_rd_atomic = resp_res; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); } static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth) { struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IBV_QPS_RTS; ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); if (ret) return ret; if (init_depth != RDMA_MAX_INIT_DEPTH) qp_attr.max_rd_atomic = init_depth; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); } static int ucma_modify_qp_sqd(struct rdma_cm_id *id) { struct ibv_qp_attr qp_attr; if (!id->qp) return 0; qp_attr.qp_state = IBV_QPS_SQD; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); } static int ucma_modify_qp_err(struct rdma_cm_id *id) { struct ibv_qp_attr qp_attr; if (!id->qp) return 0; qp_attr.qp_state = IBV_QPS_ERR; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); } static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num, __be16 pkey, uint16_t *pkey_index) { int ret, i; __be16 chk_pkey; for (i = 0, ret = 0; !ret; i++) { ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey); if (!ret && pkey == chk_pkey) { *pkey_index = (uint16_t) i; return 0; } } return ERR(EINVAL); } static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int ret; ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, id_priv->id.route.addr.addr.ibaddr.pkey, &qp_attr.pkey_index); if (ret) return ret; qp_attr.port_num = id_priv->id.port_num; qp_attr.qp_state = IBV_QPS_INIT; qp_attr.qp_access_flags = 0; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS | IBV_QP_PKEY_INDEX | IBV_QP_PORT); return rdma_seterrno(ret); } static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; if (abi_ver == 3) return ucma_init_conn_qp3(id_priv, qp); qp_attr.qp_state = IBV_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask)); } static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int ret; ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, id_priv->id.route.addr.addr.ibaddr.pkey, &qp_attr.pkey_index); if (ret) return ret; qp_attr.port_num = id_priv->id.port_num; qp_attr.qp_state = IBV_QPS_INIT; qp_attr.qkey = RDMA_UDP_QKEY; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY | IBV_QP_PKEY_INDEX | IBV_QP_PORT); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTR; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTS; qp_attr.sq_psn = 0; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); return rdma_seterrno(ret); } static int ucma_init_ud_qp(struct cma_id_private *id_priv, 
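The workaround in ucma_modify_qp_rtr() clears qp_attr_mask bits that the kernel only interprets for RoCE when the port's link layer is plain InfiniBand; UINT_MAX ^ 0xe00000 is just ~0xe00000, i.e. bits 21-23. A hypothetical helper plus a tiny usage check:

#include <stdint.h>
#include <stdio.h>

#define ROCE_ONLY_ATTR_BITS 0xe00000u   /* bits 21-23 of the attr mask */

static uint32_t strip_roce_only_bits(uint32_t qp_attr_mask, int is_ib_link)
{
	if (is_ib_link)
		qp_attr_mask &= ~ROCE_ONLY_ATTR_BITS;
	return qp_attr_mask;
}

int main(void)
{
	/* 0xfff000 -> 0x1ff000 once bits 21-23 are cleared */
	printf("0x%x\n", strip_roce_only_bits(0xfff000u, 1));
	return 0;
}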
struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; if (abi_ver == 3) return ucma_init_ud_qp3(id_priv, qp); qp_attr.qp_state = IBV_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTR; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTS; qp_attr.sq_psn = 0; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); return rdma_seterrno(ret); } static void ucma_destroy_cqs(struct rdma_cm_id *id) { if (id->qp_type == IBV_QPT_XRC_RECV && id->srq) return; if (id->recv_cq) { ibv_destroy_cq(id->recv_cq); if (id->send_cq && (id->send_cq != id->recv_cq)) { ibv_destroy_cq(id->send_cq); id->send_cq = NULL; } id->recv_cq = NULL; } if (id->recv_cq_channel) { ibv_destroy_comp_channel(id->recv_cq_channel); if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) { ibv_destroy_comp_channel(id->send_cq_channel); id->send_cq_channel = NULL; } id->recv_cq_channel = NULL; } } static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) { if (recv_size) { id->recv_cq_channel = ibv_create_comp_channel(id->verbs); if (!id->recv_cq_channel) goto err; id->recv_cq = ibv_create_cq(id->verbs, recv_size, id, id->recv_cq_channel, 0); if (!id->recv_cq) goto err; } if (send_size) { id->send_cq_channel = ibv_create_comp_channel(id->verbs); if (!id->send_cq_channel) goto err; id->send_cq = ibv_create_cq(id->verbs, send_size, id, id->send_cq_channel, 0); if (!id->send_cq) goto err; } return 0; err: ucma_destroy_cqs(id); return ERR(ENOMEM); } int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr) { struct cma_id_private *id_priv; struct ibv_srq *srq; int ret; id_priv = container_of(id, struct cma_id_private, id); if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE)) return ERR(EINVAL); if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) { attr->pd = id->pd; attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD; } if (attr->srq_type == IBV_SRQT_XRC) { if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) { attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); if (!attr->xrcd) return -1; } if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) { ret = ucma_create_cqs(id, 0, attr->attr.max_wr); if (ret) return ret; attr->cq = id->recv_cq; } attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ; } srq = ibv_create_srq_ex(id->verbs, attr); if (!srq) { ret = -1; goto err; } if (!id->pd) id->pd = attr->pd; id->srq = srq; return 0; err: ucma_destroy_cqs(id); return ret; } int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, struct ibv_srq_init_attr *attr) { struct ibv_srq_init_attr_ex attr_ex; int ret; memcpy(&attr_ex, attr, sizeof(*attr)); attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD; if (id->qp_type == IBV_QPT_XRC_RECV) { attr_ex.srq_type = IBV_SRQT_XRC; } else { attr_ex.srq_type = IBV_SRQT_BASIC; } attr_ex.pd = pd; ret = rdma_create_srq_ex(id, &attr_ex); memcpy(attr, &attr_ex, sizeof(*attr)); return ret; } void rdma_destroy_srq(struct rdma_cm_id *id) { ibv_destroy_srq(id->srq); id->srq = NULL; ucma_destroy_cqs(id); } int rdma_create_qp_ex(struct rdma_cm_id *id, struct ibv_qp_init_attr_ex *attr) { struct cma_id_private *id_priv; struct ibv_qp *qp; int ret; if (id->qp) return ERR(EINVAL); id_priv = container_of(id, struct cma_id_private, id); if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) 
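ucma_init_ud_qp3() is the full UD bring-up ladder: INIT with pkey/port/qkey, then RTR, then RTS with a starting PSN. The same sequence condensed into one function against the verbs API; error values are returned raw here rather than wrapped in ERR()/rdma_seterrno():

#include <infiniband/verbs.h>

static int ud_qp_bringup(struct ibv_qp *qp, uint8_t port,
			 uint16_t pkey_index, uint32_t qkey)
{
	struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT,
				    .pkey_index = pkey_index,
				    .port_num = port,
				    .qkey = qkey };
	int ret;

	ret = ibv_modify_qp(qp, &attr,
			    IBV_QP_STATE | IBV_QP_PKEY_INDEX |
			    IBV_QP_PORT | IBV_QP_QKEY);
	if (ret)
		return ret;

	attr.qp_state = IBV_QPS_RTR;
	ret = ibv_modify_qp(qp, &attr, IBV_QP_STATE);
	if (ret)
		return ret;

	attr.qp_state = IBV_QPS_RTS;
	attr.sq_psn = 0;
	return ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
}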
{ attr->comp_mask |= IBV_QP_INIT_ATTR_PD; attr->pd = id->pd; } else if (id->verbs != attr->pd->context) return ERR(EINVAL); if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) || (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq)) return ERR(EINVAL); if (id->qp_type == IBV_QPT_XRC_RECV) { if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) { attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); if (!attr->xrcd) return -1; attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD; } } ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr, attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr); if (ret) return ret; if (!attr->send_cq) attr->send_cq = id->send_cq; if (!attr->recv_cq) attr->recv_cq = id->recv_cq; if (id->srq && !attr->srq) attr->srq = id->srq; qp = ibv_create_qp_ex(id->verbs, attr); if (!qp) { ret = ERR(ENOMEM); goto err1; } if (ucma_is_ud_qp(id->qp_type)) ret = ucma_init_ud_qp(id_priv, qp); else ret = ucma_init_conn_qp(id_priv, qp); if (ret) goto err2; id->pd = qp->pd; id->qp = qp; return 0; err2: ibv_destroy_qp(qp); err1: ucma_destroy_cqs(id); return ret; } int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { struct ibv_qp_init_attr_ex attr_ex; int ret; memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr)); attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; attr_ex.pd = pd ? pd : id->pd; ret = rdma_create_qp_ex(id, &attr_ex); memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr)); return ret; } void rdma_destroy_qp(struct rdma_cm_id *id) { ibv_destroy_qp(id->qp); id->qp = NULL; ucma_destroy_cqs(id); } static int ucma_valid_param(struct cma_id_private *id_priv, struct rdma_conn_param *param) { if (id_priv->id.ps != RDMA_PS_TCP) return 0; if (!id_priv->id.qp && !param) goto err; if (!param) return 0; if ((param->responder_resources != RDMA_MAX_RESP_RES) && (param->responder_resources > id_priv->cma_dev->max_responder_resources)) goto err; if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) && (param->initiator_depth > id_priv->cma_dev->max_initiator_depth)) goto err; return 0; err: return ERR(EINVAL); } static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, struct ucma_abi_conn_param *dst, struct rdma_conn_param *src, uint32_t qp_num, uint8_t srq) { dst->qp_num = qp_num; dst->srq = srq; dst->responder_resources = id_priv->responder_resources; dst->initiator_depth = id_priv->initiator_depth; dst->valid = 1; if (id_priv->connect_len) { memcpy(dst->private_data, id_priv->connect, id_priv->connect_len); dst->private_data_len = id_priv->connect_len; } if (src) { dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; if (src->private_data && src->private_data_len) { memcpy(dst->private_data + dst->private_data_len, src->private_data, src->private_data_len); dst->private_data_len += src->private_data_len; } } else { dst->retry_count = 7; dst->rnr_retry_count = 7; } } int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct ucma_abi_connect cmd; struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); ret = ucma_valid_param(id_priv, conn_param); if (ret) return ret; if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH) id_priv->initiator_depth = conn_param->initiator_depth; else id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth; if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES) 
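ucma_copy_conn_param_to_kern() first copies any private data staged on the id (connect/connect_len) and then appends the caller's buffer behind it. The same marshalling in isolation; the capacity constant is hypothetical (the real bound comes from the ABI struct), and a bounds check is added here that the original leaves to its callers:

#include <stdint.h>
#include <string.h>

#define PRIV_CAP 196u   /* hypothetical capacity */

struct kern_conn_param_like {
	uint8_t private_data[PRIV_CAP];
	uint8_t private_data_len;
};

static int marshal_private_data(struct kern_conn_param_like *dst,
				const void *staged, size_t staged_len,
				const void *user, size_t user_len)
{
	if (staged_len + user_len > PRIV_CAP)
		return -1;
	if (staged_len)
		memcpy(dst->private_data, staged, staged_len);
	if (user_len)
		memcpy(dst->private_data + staged_len, user, user_len);
	dst->private_data_len = staged_len + user_len;
	return 0;
}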
id_priv->responder_resources = conn_param->responder_resources; else id_priv->responder_resources = id_priv->cma_dev->max_responder_resources; CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT); cmd.id = id_priv->handle; if (id->qp) { ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, id->qp->qp_num, (id->qp->srq != NULL)); } else if (conn_param) { ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, conn_param->qp_num, conn_param->srq); } else { ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, 0, 0); } ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; if (id_priv->connect_len) { free(id_priv->connect); id_priv->connect_len = 0; } return ucma_complete(id); } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct ucma_abi_listen cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.backlog = backlog; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; if (af_ib_support) return ucma_query_addr(id); else return ucma_query_route(id); } int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id) { struct cma_id_private *id_priv; struct rdma_cm_event *event; int ret; id_priv = container_of(listen, struct cma_id_private, id); if (!id_priv->sync) return ERR(EINVAL); if (listen->event) { rdma_ack_cm_event(listen->event); listen->event = NULL; } ret = rdma_get_cm_event(listen->channel, &event); if (ret) return ret; if (event->status) { ret = ERR(event->status); goto err; } if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { ret = ERR(EINVAL); goto err; } if (id_priv->qp_init_attr) { struct ibv_qp_init_attr attr; attr = *id_priv->qp_init_attr; ret = rdma_create_qp(event->id, listen->pd, &attr); if (ret) goto err; } *id = event->id; (*id)->event = event; return 0; err: listen->event = event; return ret; } int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct ucma_abi_accept cmd; struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); ret = ucma_valid_param(id_priv, conn_param); if (ret) return ret; if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) { id_priv->initiator_depth = min(id_priv->initiator_depth, id_priv->cma_dev->max_initiator_depth); } else { id_priv->initiator_depth = conn_param->initiator_depth; } if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) { id_priv->responder_resources = min(id_priv->responder_resources, id_priv->cma_dev->max_responder_resources); } else { id_priv->responder_resources = conn_param->responder_resources; } if (!ucma_is_ud_qp(id->qp_type)) { ret = ucma_modify_qp_rtr(id, id_priv->responder_resources); if (ret) return ret; ret = ucma_modify_qp_rts(id, id_priv->initiator_depth); if (ret) return ret; } CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); cmd.id = id_priv->handle; cmd.uid = (uintptr_t) id_priv; if (id->qp) ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, id->qp->qp_num, (id->qp->srq != NULL)); else ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, conn_param->qp_num, conn_param->srq); ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ucma_modify_qp_err(id); return (ret >= 0) ? 
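rdma_accept() treats RDMA_MAX_INIT_DEPTH / RDMA_MAX_RESP_RES (or a missing conn_param) as "use as much as possible", in which case the value remembered from the connect request is clamped to the local device's limit; an explicit value from the caller is taken verbatim. In miniature, with a hypothetical sentinel constant:

#include <stdint.h>

#define MAX_SENTINEL 0xFF   /* stands in for RDMA_MAX_INIT_DEPTH/RESP_RES */

static uint8_t min_u8(uint8_t a, uint8_t b) { return a < b ? a : b; }

static uint8_t pick_depth(int have_param, uint8_t requested,
			  uint8_t remembered, uint8_t device_max)
{
	if (!have_param || requested == MAX_SENTINEL)
		return min_u8(remembered, device_max);
	return requested;
}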
ERR(ENODATA) : -1; } if (ucma_is_ud_qp(id->qp_type)) return 0; return ucma_complete(id); } int rdma_reject(struct rdma_cm_id *id, const void *private_data, uint8_t private_data_len) { struct ucma_abi_reject cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, REJECT); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; if (private_data && private_data_len) { memcpy(cmd.private_data, private_data, private_data_len); cmd.private_data_len = private_data_len; } ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return 0; } int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) { struct ucma_abi_notify cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.event = event; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return 0; } int ucma_shutdown(struct rdma_cm_id *id) { switch (id->verbs->device->transport_type) { case IBV_TRANSPORT_IB: return ucma_modify_qp_err(id); case IBV_TRANSPORT_IWARP: return ucma_modify_qp_sqd(id); default: return ERR(EINVAL); } } int rdma_disconnect(struct rdma_cm_id *id) { struct ucma_abi_disconnect cmd; struct cma_id_private *id_priv; int ret; ret = ucma_shutdown(id); if (ret) return ret; CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return ucma_complete(id); } static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr, socklen_t addrlen, void *context) { struct ucma_abi_create_id_resp resp; struct cma_id_private *id_priv; struct cma_multicast *mc, **pos; int ret; id_priv = container_of(id, struct cma_id_private, id); mc = calloc(1, sizeof(*mc)); if (!mc) return ERR(ENOMEM); mc->context = context; mc->id_priv = id_priv; memcpy(&mc->addr, addr, addrlen); if (pthread_cond_init(&mc->cond, NULL)) { ret = -1; goto err1; } pthread_mutex_lock(&id_priv->mut); mc->next = id_priv->mc_list; id_priv->mc_list = mc; pthread_mutex_unlock(&id_priv->mut); if (af_ib_support) { struct ucma_abi_join_mcast cmd; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp); cmd.id = id_priv->handle; memcpy(&cmd.addr, addr, addrlen); cmd.addr_size = addrlen; cmd.uid = (uintptr_t) mc; cmd.reserved = 0; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? ERR(ENODATA) : -1; goto err2; } } else { struct ucma_abi_join_ip_mcast cmd; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp); cmd.id = id_priv->handle; memcpy(&cmd.addr, addr, addrlen); cmd.uid = (uintptr_t) mc; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? 
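ucma_shutdown() picks the QP state used to tear down a connection by transport: ERR for InfiniBand/RoCE, SQD (send-queue drain) for iWARP. The same dispatch as a standalone helper against the verbs enums:

#include <errno.h>
#include <infiniband/verbs.h>

static int shutdown_state(enum ibv_transport_type t,
			  enum ibv_qp_state *state)
{
	switch (t) {
	case IBV_TRANSPORT_IB:
		*state = IBV_QPS_ERR;
		return 0;
	case IBV_TRANSPORT_IWARP:
		*state = IBV_QPS_SQD;
		return 0;
	default:
		return EINVAL;
	}
}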
ERR(ENODATA) : -1; goto err2; } } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); mc->handle = resp.id; return ucma_complete(id); err2: pthread_mutex_lock(&id_priv->mut); for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next) ; *pos = mc->next; pthread_mutex_unlock(&id_priv->mut); err1: free(mc); return ret; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { int addrlen; addrlen = ucma_addrlen(addr); if (!addrlen) return ERR(EINVAL); return rdma_join_multicast2(id, addr, addrlen, context); } int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct ucma_abi_destroy_id cmd; struct ucma_abi_destroy_id_resp resp; struct cma_id_private *id_priv; struct cma_multicast *mc, **pos; int ret, addrlen; addrlen = ucma_addrlen(addr); if (!addrlen) return ERR(EINVAL); id_priv = container_of(id, struct cma_id_private, id); pthread_mutex_lock(&id_priv->mut); for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next) if (!memcmp(&(*pos)->addr, addr, addrlen)) break; mc = *pos; if (*pos) *pos = mc->next; pthread_mutex_unlock(&id_priv->mut); if (!mc) return ERR(EADDRNOTAVAIL); if (id->qp) ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp); cmd.id = mc->handle; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? ERR(ENODATA) : -1; goto free; } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); pthread_mutex_lock(&id_priv->mut); while (mc->events_completed < resp.events_reported) pthread_cond_wait(&mc->cond, &id_priv->mut); pthread_mutex_unlock(&id_priv->mut); ret = 0; free: free(mc); return ret; } static void ucma_complete_event(struct cma_id_private *id_priv) { pthread_mutex_lock(&id_priv->mut); id_priv->events_completed++; pthread_cond_signal(&id_priv->cond); pthread_mutex_unlock(&id_priv->mut); } static void ucma_complete_mc_event(struct cma_multicast *mc) { pthread_mutex_lock(&mc->id_priv->mut); mc->events_completed++; pthread_cond_signal(&mc->cond); mc->id_priv->events_completed++; pthread_cond_signal(&mc->id_priv->cond); pthread_mutex_unlock(&mc->id_priv->mut); } int rdma_ack_cm_event(struct rdma_cm_event *event) { struct cma_event *evt; if (!event) return ERR(EINVAL); evt = container_of(event, struct cma_event, event); if (evt->mc) ucma_complete_mc_event(evt->mc); else ucma_complete_event(evt->id_priv); free(evt); return 0; } static void ucma_process_addr_resolved(struct cma_event *evt) { if (af_ib_support) { evt->event.status = ucma_query_addr(&evt->id_priv->id); if (!evt->event.status && evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB) evt->event.status = ucma_query_gid(&evt->id_priv->id); } else { evt->event.status = ucma_query_route(&evt->id_priv->id); } if (evt->event.status) evt->event.event = RDMA_CM_EVENT_ADDR_ERROR; } static void ucma_process_route_resolved(struct cma_event *evt) { if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB) return; if (af_ib_support) evt->event.status = ucma_query_path(&evt->id_priv->id); else evt->event.status = ucma_query_route(&evt->id_priv->id); if (evt->event.status) evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR; } static int ucma_query_req_info(struct rdma_cm_id *id) { int ret; if (!af_ib_support) return ucma_query_route(id); ret = ucma_query_addr(id); if (ret) return ret; ret = ucma_query_gid(id); if (ret) return ret; ret = ucma_query_path(id); if (ret) return ret; return 0; } static int ucma_process_conn_req(struct cma_event *evt, uint32_t handle) { struct 
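rdma_leave_multicast() (and the error path of the join above) removes the multicast entry by walking the list with a pointer-to-pointer, so the head needs no special case. The pattern on its own, with a hypothetical node type:

#include <stddef.h>

struct node { struct node *next; int key; };

static struct node *unlink_by_key(struct node **head, int key)
{
	struct node **pos, *found;

	for (pos = head; *pos; pos = &(*pos)->next)
		if ((*pos)->key == key)
			break;
	found = *pos;
	if (found)
		*pos = found->next;    /* works for the head slot too */
	return found;
}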
cma_id_private *id_priv; int ret; id_priv = ucma_alloc_id(evt->id_priv->id.channel, evt->id_priv->id.context, evt->id_priv->id.ps, evt->id_priv->id.qp_type); if (!id_priv) { ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle); ret = ERR(ENOMEM); goto err1; } evt->event.listen_id = &evt->id_priv->id; evt->event.id = &id_priv->id; id_priv->handle = handle; ucma_insert_id(id_priv); id_priv->initiator_depth = evt->event.param.conn.initiator_depth; id_priv->responder_resources = evt->event.param.conn.responder_resources; if (evt->id_priv->sync) { ret = rdma_migrate_id(&id_priv->id, NULL); if (ret) goto err2; } ret = ucma_query_req_info(&id_priv->id); if (ret) goto err2; return 0; err2: rdma_destroy_id(&id_priv->id); err1: ucma_complete_event(evt->id_priv); return ret; } static int ucma_process_conn_resp(struct cma_id_private *id_priv) { struct ucma_abi_accept cmd; int ret; ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES); if (ret) goto err; ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH); if (ret) goto err; CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); cmd.id = id_priv->handle; ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? ERR(ENODATA) : -1; goto err; } return 0; err: ucma_modify_qp_err(&id_priv->id); return ret; } static int ucma_process_join(struct cma_event *evt) { evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; evt->mc->mlid = evt->event.param.ud.ah_attr.dlid; if (!evt->id_priv->id.qp) return 0; return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp, &evt->mc->mgid, evt->mc->mlid)); } static void ucma_copy_conn_event(struct cma_event *event, struct ucma_abi_conn_param *src) { struct rdma_conn_param *dst = &event->event.param.conn; dst->private_data_len = src->private_data_len; if (src->private_data_len) { dst->private_data = &event->private_data; memcpy(&event->private_data, src->private_data, src->private_data_len); } dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; } static void ucma_copy_ud_event(struct cma_event *event, struct ucma_abi_ud_param *src) { struct rdma_ud_param *dst = &event->event.param.ud; dst->private_data_len = src->private_data_len; if (src->private_data_len) { dst->private_data = &event->private_data; memcpy(&event->private_data, src->private_data, src->private_data_len); } ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } int rdma_get_cm_event(struct rdma_event_channel *channel, struct rdma_cm_event **event) { struct ucma_abi_event_resp resp; struct ucma_abi_get_event cmd; struct cma_event *evt; int ret; ret = ucma_init(); if (ret) return ret; if (!event) return ERR(EINVAL); evt = malloc(sizeof(*evt)); if (!evt) return ERR(ENOMEM); retry: memset(evt, 0, sizeof(*evt)); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp); ret = write(channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { free(evt); return (ret >= 0) ? ERR(ENODATA) : -1; } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); evt->event.event = resp.event; /* * We should have a non-zero uid, except for connection requests. * But a bug in older kernels can report a uid 0. Work-around this * issue by looking up the cma_id based on the kernel's id when the * uid is 0 and we're processing a connection established event. 
* In all other cases, if the uid is 0, we discard the event, like * the kernel should have done. */ if (resp.uid) { evt->id_priv = (void *) (uintptr_t) resp.uid; } else { evt->id_priv = ucma_lookup_id(resp.id); if (!evt->id_priv) { syslog(LOG_WARNING, PFX "Warning: discarding unmatched " "event - rdma_destroy_id may hang.\n"); goto retry; } if (resp.event != RDMA_CM_EVENT_ESTABLISHED) { ucma_complete_event(evt->id_priv); goto retry; } } evt->event.id = &evt->id_priv->id; evt->event.status = resp.status; switch (resp.event) { case RDMA_CM_EVENT_ADDR_RESOLVED: ucma_process_addr_resolved(evt); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: ucma_process_route_resolved(evt); break; case RDMA_CM_EVENT_CONNECT_REQUEST: evt->id_priv = (void *) (uintptr_t) resp.uid; if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) ucma_copy_ud_event(evt, &resp.param.ud); else ucma_copy_conn_event(evt, &resp.param.conn); ret = ucma_process_conn_req(evt, resp.id); if (ret) goto retry; break; case RDMA_CM_EVENT_CONNECT_RESPONSE: ucma_copy_conn_event(evt, &resp.param.conn); evt->event.status = ucma_process_conn_resp(evt->id_priv); if (!evt->event.status) evt->event.event = RDMA_CM_EVENT_ESTABLISHED; else { evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR; evt->id_priv->connect_error = 1; } break; case RDMA_CM_EVENT_ESTABLISHED: if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) { ucma_copy_ud_event(evt, &resp.param.ud); break; } ucma_copy_conn_event(evt, &resp.param.conn); break; case RDMA_CM_EVENT_REJECTED: if (evt->id_priv->connect_error) { ucma_complete_event(evt->id_priv); goto retry; } ucma_copy_conn_event(evt, &resp.param.conn); ucma_modify_qp_err(evt->event.id); break; case RDMA_CM_EVENT_DISCONNECTED: if (evt->id_priv->connect_error) { ucma_complete_event(evt->id_priv); goto retry; } ucma_copy_conn_event(evt, &resp.param.conn); break; case RDMA_CM_EVENT_MULTICAST_JOIN: evt->mc = (void *) (uintptr_t) resp.uid; evt->id_priv = evt->mc->id_priv; evt->event.id = &evt->id_priv->id; ucma_copy_ud_event(evt, &resp.param.ud); evt->event.param.ud.private_data = evt->mc->context; evt->event.status = ucma_process_join(evt); if (evt->event.status) evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR; break; case RDMA_CM_EVENT_MULTICAST_ERROR: evt->mc = (void *) (uintptr_t) resp.uid; evt->id_priv = evt->mc->id_priv; evt->event.id = &evt->id_priv->id; evt->event.param.ud.private_data = evt->mc->context; break; default: evt->id_priv = (void *) (uintptr_t) resp.uid; evt->event.id = &evt->id_priv->id; evt->event.status = resp.status; if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) ucma_copy_ud_event(evt, &resp.param.ud); else ucma_copy_conn_event(evt, &resp.param.conn); break; } *event = &evt->event; return 0; } const char *rdma_event_str(enum rdma_cm_event_type event) { switch (event) { case RDMA_CM_EVENT_ADDR_RESOLVED: return "RDMA_CM_EVENT_ADDR_RESOLVED"; case RDMA_CM_EVENT_ADDR_ERROR: return "RDMA_CM_EVENT_ADDR_ERROR"; case RDMA_CM_EVENT_ROUTE_RESOLVED: return "RDMA_CM_EVENT_ROUTE_RESOLVED"; case RDMA_CM_EVENT_ROUTE_ERROR: return "RDMA_CM_EVENT_ROUTE_ERROR"; case RDMA_CM_EVENT_CONNECT_REQUEST: return "RDMA_CM_EVENT_CONNECT_REQUEST"; case RDMA_CM_EVENT_CONNECT_RESPONSE: return "RDMA_CM_EVENT_CONNECT_RESPONSE"; case RDMA_CM_EVENT_CONNECT_ERROR: return "RDMA_CM_EVENT_CONNECT_ERROR"; case RDMA_CM_EVENT_UNREACHABLE: return "RDMA_CM_EVENT_UNREACHABLE"; case RDMA_CM_EVENT_REJECTED: return "RDMA_CM_EVENT_REJECTED"; case RDMA_CM_EVENT_ESTABLISHED: return "RDMA_CM_EVENT_ESTABLISHED"; case RDMA_CM_EVENT_DISCONNECTED: return "RDMA_CM_EVENT_DISCONNECTED"; case 
RDMA_CM_EVENT_DEVICE_REMOVAL: return "RDMA_CM_EVENT_DEVICE_REMOVAL"; case RDMA_CM_EVENT_MULTICAST_JOIN: return "RDMA_CM_EVENT_MULTICAST_JOIN"; case RDMA_CM_EVENT_MULTICAST_ERROR: return "RDMA_CM_EVENT_MULTICAST_ERROR"; case RDMA_CM_EVENT_ADDR_CHANGE: return "RDMA_CM_EVENT_ADDR_CHANGE"; case RDMA_CM_EVENT_TIMEWAIT_EXIT: return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; default: return "UNKNOWN EVENT"; } } int rdma_set_option(struct rdma_cm_id *id, int level, int optname, void *optval, size_t optlen) { struct ucma_abi_set_option cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.optval = (uintptr_t) optval; cmd.level = level; cmd.optname = optname; cmd.optlen = optlen; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return 0; } int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel) { struct ucma_abi_migrate_resp resp; struct ucma_abi_migrate_id cmd; struct cma_id_private *id_priv; int ret, sync; id_priv = container_of(id, struct cma_id_private, id); if (id_priv->sync && !channel) return ERR(EINVAL); if ((sync = (channel == NULL))) { channel = rdma_create_event_channel(); if (!channel) return -1; } CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp); cmd.id = id_priv->handle; cmd.fd = id->channel->fd; ret = write(channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { if (sync) rdma_destroy_event_channel(channel); return (ret >= 0) ? ERR(ENODATA) : -1; } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); if (id_priv->sync) { if (id->event) { rdma_ack_cm_event(id->event); id->event = NULL; } rdma_destroy_event_channel(id->channel); } /* * Eventually if we want to support migrating channels while events are * being processed on the current channel, we need to block here while * there are any outstanding events on the current channel for this id * to prevent the user from processing events for this id on the old * channel after this call returns. 
*/ pthread_mutex_lock(&id_priv->mut); id_priv->sync = sync; id->channel = channel; while (id_priv->events_completed < resp.events_reported) pthread_cond_wait(&id_priv->cond, &id_priv->mut); pthread_mutex_unlock(&id_priv->mut); return 0; } static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { struct cma_id_private *id_priv; int ret; if (af_ib_support) ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len); else ret = rdma_bind_addr(id, res->ai_src_addr); if (ret) return ret; id_priv = container_of(id, struct cma_id_private, id); if (pd) id->pd = pd; if (qp_init_attr) { id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr)); if (!id_priv->qp_init_attr) return ERR(ENOMEM); *id_priv->qp_init_attr = *qp_init_attr; id_priv->qp_init_attr->qp_type = res->ai_qp_type; } return 0; } int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { struct rdma_cm_id *cm_id; struct cma_id_private *id_priv; int ret; ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type); if (ret) return ret; if (res->ai_flags & RAI_PASSIVE) { ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr); if (ret) goto err; goto out; } if (af_ib_support) ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len, res->ai_dst_addr, res->ai_dst_len, 2000); else ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000); if (ret) goto err; if (res->ai_route_len) { ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, res->ai_route, res->ai_route_len); if (!ret) ret = ucma_complete(cm_id); } else { ret = rdma_resolve_route(cm_id, 2000); } if (ret) goto err; if (qp_init_attr) { qp_init_attr->qp_type = res->ai_qp_type; ret = rdma_create_qp(cm_id, pd, qp_init_attr); if (ret) goto err; } if (res->ai_connect_len) { id_priv = container_of(cm_id, struct cma_id_private, id); id_priv->connect = malloc(res->ai_connect_len); if (!id_priv->connect) { ret = ERR(ENOMEM); goto err; } memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len); id_priv->connect_len = res->ai_connect_len; } out: *id = cm_id; return 0; err: rdma_destroy_ep(cm_id); return ret; } void rdma_destroy_ep(struct rdma_cm_id *id) { struct cma_id_private *id_priv; if (id->qp) rdma_destroy_qp(id); if (id->srq) rdma_destroy_srq(id); id_priv = container_of(id, struct cma_id_private, id); if (id_priv->qp_init_attr) free(id_priv->qp_init_attr); rdma_destroy_id(id); } int ucma_max_qpsize(struct rdma_cm_id *id) { struct cma_id_private *id_priv; int i, max_size = 0; id_priv = container_of(id, struct cma_id_private, id); if (id && id_priv->cma_dev) { max_size = id_priv->cma_dev->max_qpsize; } else { ucma_init_all(); for (i = 0; i < cma_dev_cnt; i++) { if (!max_size || max_size > cma_dev_array[i].max_qpsize) max_size = cma_dev_array[i].max_qpsize; } } return max_size; } __be16 ucma_get_port(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid)); default: return 0; } } __be16 rdma_get_src_port(struct rdma_cm_id *id) { return ucma_get_port(&id->route.addr.src_addr); } __be16 rdma_get_dst_port(struct rdma_cm_id *id) { return ucma_get_port(&id->route.addr.dst_addr); }