diff --git a/sys/contrib/rdma/krping/krping.c b/sys/contrib/rdma/krping/krping.c index 75c874411bb3..da0a03af42cd 100644 --- a/sys/contrib/rdma/krping/krping.c +++ b/sys/contrib/rdma/krping/krping.c @@ -1,2210 +1,2214 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "krping.h" #include "getopt.h" #define PFX "krping: " extern int krping_debug; #define DEBUG_LOG(...) 
do { if (krping_debug) log(LOG_INFO, __VA_ARGS__); } while (0) #define BIND_INFO 1 MODULE_AUTHOR("Steve Wise"); MODULE_DESCRIPTION("RDMA ping server"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(krping, 1); MODULE_DEPEND(krping, linuxkpi, 1, 1, 1); static __inline uint64_t get_cycles(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((u_int64_t)high << 32)); } typedef uint64_t cycles_t; enum mem_type { DMA = 1, REG = 2, }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, {"addr6", OPT_STRING, 'A'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, {"bw", OPT_NOPARAM, 'B'}, {"duplex", OPT_NOPARAM, 'd'}, {"tos", OPT_INT, 't'}, {"txdepth", OPT_INT, 'T'}, {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, {"fr", OPT_NOPARAM, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) static DEFINE_MUTEX(krping_mutex); /* * List of running krping threads. 
*/ static LIST_HEAD(krping_cbs); /* * Invoke like this, one on each side, using the server's address on * the RDMA device (iw%d): * * /bin/echo server,port=9999,addr=192.168.69.142,validate > /proc/krping * /bin/echo client,port=9999,addr=192.168.69.142,validate > /proc/krping * /bin/echo client,port=9999,addr6=2001:db8:0:f101::1,validate > /proc/krping * * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/add/len * server rdma reads "ping" data from source * server sends "go ahead" on rdma read completion * client sends sink rkey/addr/len * server receives sink rkey/addr/len * server rdma writes "pong" data to sink * server sends "go ahead" on rdma write completion * */ /* * These states are used to signal events between the completion handler * and the main client or server thread. * * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, * and RDMA_WRITE_COMPLETE for each ping. */ enum test_state { IDLE = 1, CONNECT_REQUEST, ADDR_RESOLVED, ROUTE_RESOLVED, CONNECTED, RDMA_READ_ADV, RDMA_READ_COMPLETE, RDMA_WRITE_ADV, RDMA_WRITE_COMPLETE, ERROR }; struct krping_rdma_info { uint64_t buf; uint32_t rkey; uint32_t size; }; /* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 #define RPING_SQ_DEPTH 64 /* * Control block struct. 
*/
/*
 * Per-run state shared between the test thread and the two callbacks in
 * this file: krping_cma_event_handler() reaches it through
 * cma_id->context and krping_cq_event_handler() through its ctx argument.
 * The callbacks update 'state' and wake 'sem'; the test thread sleeps on
 * 'sem' until 'state' reaches the value it is waiting for.
 */
struct krping_cb {
	int server;			/* 0 iff client */
	struct ib_cq *cq;		/* single CQ used for both send and recv */
	struct ib_pd *pd;		/* protection domain from ib_alloc_pd() */
	struct ib_qp *qp;		/* RC queue pair created on the cm_id */

	struct ib_mr *dma_mr;
	struct ib_fast_reg_page_list *page_list;
	int page_list_len;		/* page count given to ib_alloc_mr() */
	struct ib_reg_wr reg_mr_wr;	/* REG_MR WR, chained after invalidate_wr */
	struct ib_send_wr invalidate_wr; /* LOCAL_INV WR, head of the 2-WR chain */
	struct ib_mr *reg_mr;		/* MR allocated in krping_setup_buffers() */
	int server_invalidate;		/* server replies with SEND_WITH_INV */
	int read_inv;			/* server uses RDMA_READ_WITH_INV */
	u8 key;				/* counter fed to ib_update_fast_reg_key() */

	struct ib_recv_wr rq_wr;	/* recv work request record */
	struct ib_sge recv_sgl;		/* recv single SGE */
	struct krping_rdma_info recv_buf __aligned(16);	/* embedded recv buffer, DMA-mapped in krping_setup_buffers() */
	u64 recv_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(recv_mapping)

	struct ib_send_wr sq_wr;	/* send work request record */
	struct ib_sge send_sgl;
	struct krping_rdma_info send_buf __aligned(16); /* single send buf */
	u64 send_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(send_mapping)

	struct ib_rdma_wr rdma_sq_wr;	/* rdma work request record */
	struct ib_sge rdma_sgl;		/* rdma single SGE */
	char *rdma_buf;			/* used as rdma sink */
	u64 rdma_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
	struct ib_mr *rdma_mr;

	uint32_t remote_rkey;		/* peer's RKEY, from its advertisement */
	uint64_t remote_addr;		/* peer's buffer address (TO) */
	uint32_t remote_len;		/* peer's buffer length */

	char *start_buf;		/* rdma read src */
	u64 start_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(start_mapping)
	struct ib_mr *start_mr;

	enum test_state state;		/* used for cond/signalling */
	wait_queue_head_t sem;		/* woken whenever 'state' may have changed */
	struct krping_stats stats;

	uint16_t port;			/* dst port in NBO */
	u8 addr[16] __aligned(8);	/* dst addr in NBO */
	char *addr_str;			/* dst addr string */
	uint8_t addr_type;		/* ADDR_FAMILY - IPv4/V6 */
	int verbose;			/* verbose logging */
	int count;			/* ping count */
	int size;			/* ping data size */
	int validate;			/* validate ping data */
	int wlat;			/* run wlat test */
	int rlat;			/* run rlat test */
	int bw;				/* run bw test */
	int duplex;			/* run bw full duplex test */
	int poll;			/* poll or block for rlat test */
	int txdepth;			/* SQ depth */
	int local_dma_lkey;		/* use 0 for lkey */
	int frtest;			/* reg test */
	int tos;			/* type of service */

	/* CM stuff */
	struct rdma_cm_id *cm_id;	/* connection on client
side,*/ /* listener on server side. */ struct rdma_cm_id *child_cm_id; /* connection on server side */ struct list_head list; }; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret; struct krping_cb *cb = cma_id->context; DEBUG_LOG("cma_event type %d cma_id %p (%s)\n", event->event, cma_id, (cma_id == cb->cm_id) ? "parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { printk(KERN_ERR PFX "rdma_resolve_route error %d\n", ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: cb->state = CONNECT_REQUEST; cb->child_cm_id = cma_id; DEBUG_LOG("child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: DEBUG_LOG("ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: printk(KERN_ERR PFX "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: printk(KERN_ERR PFX "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: printk(KERN_ERR PFX "cma detected device removal!!!!\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; default: printk(KERN_ERR PFX "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { printk(KERN_ERR PFX "Received bogus data, size %d\n", wc->byte_len); return -1; } cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = 
ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); DEBUG_LOG("Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) cb->state = RDMA_READ_ADV; else cb->state = RDMA_WRITE_ADV; return 0; } static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { printk(KERN_ERR PFX "Received bogus data, size %d\n", wc->byte_len); return -1; } if (cb->state == RDMA_READ_ADV) cb->state = RDMA_WRITE_ADV; else cb->state = RDMA_WRITE_COMPLETE; return 0; } static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) { struct krping_cb *cb = ctx; struct ib_wc wc; - struct ib_recv_wr *bad_wr; + const struct ib_recv_wr *bad_wr; int ret; BUG_ON(cb->cq != cq); if (cb->frtest) { printk(KERN_ERR PFX "cq completion event in frtest!\n"); return; } if (!cb->wlat && !cb->rlat && !cb->bw) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { DEBUG_LOG("cq flushed\n"); continue; } else { printk(KERN_ERR PFX "cq completion failed with " "wr_id %jx status %d opcode %d vender_err %x\n", (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } if (cb->state == ERROR) { printk(KERN_ERR PFX "cq completion in ERROR state\n"); return; } switch (wc.opcode) { case IB_WC_SEND: DEBUG_LOG("send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: DEBUG_LOG("rdma write completion\n"); cb->stats.write_bytes += cb->rdma_sq_wr.wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: DEBUG_LOG("rdma read completion\n"); cb->stats.read_bytes += cb->rdma_sq_wr.wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); 
break; case IB_WC_RECV: DEBUG_LOG("recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; if (cb->wlat || cb->rlat || cb->bw) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { printk(KERN_ERR PFX "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post recv error: %d\n", ret); goto error; } wake_up_interruptible(&cb->sem); break; default: printk(KERN_ERR PFX "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { printk(KERN_ERR PFX "poll error %d\n", ret); goto error; } return; error: cb->state = ERROR; wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; DEBUG_LOG("accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { printk(KERN_ERR PFX "rdma_accept error: %d\n", ret); return ret; } if (!cb->wlat && !cb->rlat && !cb->bw) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state); return -1; } } return 0; } static void krping_setup_wr(struct krping_cb *cb) { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; cb->recv_sgl.lkey = cb->pd->local_dma_lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; cb->send_sgl.lkey = cb->pd->local_dma_lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; if (cb->server || cb->wlat || cb->rlat || cb->bw) { cb->rdma_sgl.addr = cb->rdma_dma_addr; cb->rdma_sq_wr.wr.send_flags = IB_SEND_SIGNALED; 
cb->rdma_sq_wr.wr.sg_list = &cb->rdma_sgl; cb->rdma_sq_wr.wr.num_sge = 1; } /* * A chain of 2 WRs, INVALDATE_MR + REG_MR. * both unsignaled. The client uses them to reregister * the rdma buffers with a new key each iteration. */ cb->reg_mr_wr.wr.opcode = IB_WR_REG_MR; cb->reg_mr_wr.mr = cb->reg_mr; cb->invalidate_wr.next = &cb->reg_mr_wr.wr; cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; } static int krping_setup_buffers(struct krping_cb *cb) { int ret; DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); cb->send_dma_addr = ib_dma_map_single(cb->pd->device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); cb->rdma_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size, &cb->rdma_dma_addr, GFP_KERNEL); if (!cb->rdma_buf) { DEBUG_LOG(PFX "rdma_buf allocation failed\n"); ret = -ENOMEM; goto bail; } pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; cb->reg_mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, cb->page_list_len); if (IS_ERR(cb->reg_mr)) { ret = PTR_ERR(cb->reg_mr); DEBUG_LOG(PFX "recv_buf reg_mr failed %d\n", ret); goto bail; } DEBUG_LOG(PFX "reg rkey 0x%x page_list_len %u\n", cb->reg_mr->rkey, cb->page_list_len); if (!cb->server || cb->wlat || cb->rlat || cb->bw) { cb->start_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size, &cb->start_dma_addr, GFP_KERNEL); if (!cb->start_buf) { DEBUG_LOG(PFX "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); } krping_setup_wr(cb); DEBUG_LOG(PFX "allocated & registered buffers...\n"); return 0; bail: if (cb->reg_mr && !IS_ERR(cb->reg_mr)) ib_dereg_mr(cb->reg_mr); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); if (cb->dma_mr 
&& !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); if (cb->rdma_buf) { ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf, cb->rdma_dma_addr); } if (cb->start_buf) { ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf, cb->start_dma_addr); } return ret; } static void krping_free_buffers(struct krping_cb *cb) { DEBUG_LOG("krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); if (cb->reg_mr) ib_dereg_mr(cb->reg_mr); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf, cb->rdma_dma_addr); if (cb->start_buf) { ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf, cb->start_dma_addr); } } static int krping_create_qp(struct krping_cb *cb) { struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; /* For flush_qp() */ init_attr.cap.max_send_wr++; init_attr.cap.max_recv_wr++; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->child_cm_id->qp; } else { ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->cm_id->qp; } return ret; } static void krping_free_qp(struct krping_cb *cb) { ib_destroy_qp(cb->qp); ib_destroy_cq(cb->cq); ib_dealloc_pd(cb->pd); } static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) { int ret; struct ib_cq_init_attr attr = {0}; cb->pd = ib_alloc_pd(cm_id->device, 0); if (IS_ERR(cb->pd)) { 
printk(KERN_ERR PFX "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } DEBUG_LOG("created pd %p\n", cb->pd); strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); attr.cqe = cb->txdepth * 2; attr.comp_vector = 0; cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, cb, &attr); if (IS_ERR(cb->cq)) { printk(KERN_ERR PFX "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } DEBUG_LOG("created cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { printk(KERN_ERR PFX "ib_create_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { printk(KERN_ERR PFX "krping_create_qp failed: %d\n", ret); goto err2; } DEBUG_LOG("created qp %p\n", cb->qp); return 0; err2: ib_destroy_cq(cb->cq); err1: ib_dealloc_pd(cb->pd); return ret; } /* * return the (possibly rebound) rkey for the rdma buffer. * REG mode: invalidate and rebind via reg wr. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { u32 rkey; - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; int ret; struct scatterlist sg = {0}; cb->invalidate_wr.ex.invalidate_rkey = cb->reg_mr->rkey; /* * Update the reg key. */ ib_update_fast_reg_key(cb->reg_mr, ++cb->key); cb->reg_mr_wr.key = cb->reg_mr->rkey; /* * Update the reg WR with new buf info. 
*/ if (buf == (u64)cb->start_dma_addr) cb->reg_mr_wr.access = IB_ACCESS_REMOTE_READ; else cb->reg_mr_wr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; sg_dma_address(&sg) = buf; sg_dma_len(&sg) = cb->size; ret = ib_map_mr_sg(cb->reg_mr, &sg, 1, NULL, PAGE_SIZE); BUG_ON(ret <= 0 || ret > cb->page_list_len); DEBUG_LOG(PFX "post_inv = %d, reg_mr new rkey 0x%x pgsz %u len %u" " iova_start %llx\n", post_inv, cb->reg_mr_wr.key, cb->reg_mr->page_size, (unsigned)cb->reg_mr->length, (unsigned long long)cb->reg_mr->iova); if (post_inv) ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); else ret = ib_post_send(cb->qp, &cb->reg_mr_wr.wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); cb->state = ERROR; } rkey = cb->reg_mr->rkey; return rkey; } static void krping_format_send(struct krping_cb *cb, u64 buf) { struct krping_rdma_info *info = &cb->send_buf; u32 rkey; /* * Client side will do reg or mw bind before * advertising the rdma buffer. Server side * sends have no data. */ if (!cb->server || cb->wlat || cb->rlat || cb->bw) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); DEBUG_LOG("RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } static void krping_test_server(struct krping_cb *cb) { - struct ib_send_wr *bad_wr, inv; + const struct ib_send_wr *bad_wr; + struct ib_send_wr inv; int ret; while (1) { /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { printk(KERN_ERR PFX "wait for RDMA_READ_ADV state %d\n", cb->state); break; } DEBUG_LOG("server received sink adv\n"); cb->rdma_sq_wr.rkey = cb->remote_rkey; cb->rdma_sq_wr.remote_addr = cb->remote_addr; cb->rdma_sq_wr.wr.sg_list->length = cb->remote_len; cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, !cb->read_inv); cb->rdma_sq_wr.wr.next = NULL; /* Issue RDMA Read. 
*/ if (cb->read_inv) cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ; /* * Immediately follow the read with a * fenced LOCAL_INV. */ cb->rdma_sq_wr.wr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = cb->reg_mr->rkey; inv.send_flags = IB_SEND_FENCE; } ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); break; } cb->rdma_sq_wr.wr.next = NULL; DEBUG_LOG("server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { printk(KERN_ERR PFX "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG("server received read complete\n"); /* Display data in recv buf */ if (cb->verbose) printk(KERN_INFO PFX "server ping data: %s\n", cb->rdma_buf); /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); break; } DEBUG_LOG("server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { printk(KERN_ERR PFX "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } DEBUG_LOG("server received sink adv\n"); /* RDMA Write echo data */ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.rkey = cb->remote_rkey; cb->rdma_sq_wr.remote_addr = cb->remote_addr; cb->rdma_sq_wr.wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) cb->rdma_sgl.lkey = cb->pd->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); DEBUG_LOG("rdma write from lkey %x laddr %llx len %d\n", 
cb->rdma_sq_wr.wr.sg_list->lkey, (unsigned long long)cb->rdma_sq_wr.wr.sg_list->addr, cb->rdma_sq_wr.wr.sg_list->length); ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); break; } /* Wait for completion */ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { printk(KERN_ERR PFX "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG("server rdma write complete \n"); cb->state = CONNECTED; /* Tell client to begin again */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); break; } DEBUG_LOG("server posted go ahead\n"); } } static void rlat_test(struct krping_cb *cb) { int scnt; int iters = cb->count; struct timeval start_tv, stop_tv; int ret; struct ib_wc wc; - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; int ne; scnt = 0; cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ; cb->rdma_sq_wr.rkey = cb->remote_rkey; cb->rdma_sq_wr.remote_addr = cb->remote_addr; cb->rdma_sq_wr.wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { cb->state = RDMA_READ_ADV; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } while (scnt < iters) { cb->state = RDMA_READ_ADV; ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; } do { if (!cb->poll) { wait_event_interruptible(cb->sem, cb->state != RDMA_READ_ADV); if (cb->state == RDMA_READ_COMPLETE) { ne = 1; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } else { ne = -1; } } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { printk(KERN_ERR PFX "state == ERROR...bailing scnt %d\n", scnt); return; } } while (ne == 0); if (ne < 0) { 
printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? "server" : "client"); printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } ++scnt; } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size); } static void wlat_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; volatile char *poll_buf = (char *) cb->start_buf; char *buf = (char *)cb->rdma_buf; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start = NULL; cycles_t *post_cycles_stop = NULL; cycles_t *poll_cycles_start = NULL; cycles_t *poll_cycles_stop = NULL; cycles_t *last_poll_cycles_start = NULL; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } cb->rdma_sq_wr.wr.opcode = 
IB_WR_RDMA_WRITE; cb->rdma_sq_wr.rkey = cb->remote_rkey; cb->rdma_sq_wr.remote_addr = cb->remote_addr; cb->rdma_sq_wr.wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < iters && !(scnt < 1 && !cb->server)) { ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { printk(KERN_ERR PFX "state = ERROR, bailing\n"); goto done; } } } if (scnt < iters) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { printk(KERN_ERR PFX "Couldn't post send: scnt=%d\n", scnt); goto done; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); scnt++; } if (ccnt < iters) { struct ib_wc wc; int ne; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ++ccnt; if (ne < 0) { printk(KERN_ERR PFX "poll CQ failed %d\n", ne); goto done; } if (wc.status != IB_WC_SUCCESS) { printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); printk(KERN_ERR PFX "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); goto done; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); done: kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void bw_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start = NULL; cycles_t *post_cycles_stop = NULL; cycles_t *poll_cycles_start = NULL; cycles_t *poll_cycles_stop = NULL; cycles_t *last_poll_cycles_start = NULL; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } poll_cycles_stop = kmalloc(cycle_iters * 
sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); goto done; } cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.rkey = cb->remote_rkey; cb->rdma_sq_wr.remote_addr = cb->remote_addr; cb->rdma_sq_wr.wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < cb->txdepth) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { printk(KERN_ERR PFX "Couldn't post send: scnt=%d\n", scnt); goto done; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); ++scnt; } if (ccnt < iters) { int ne; struct ib_wc wc; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ccnt += 1; if (ne < 0) { printk(KERN_ERR PFX "poll CQ failed %d\n", ne); goto done; } if (wc.status != IB_WC_SUCCESS) { printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); goto done; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); done: kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void krping_rlat_test_server(struct krping_cb *cb) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_wlat_test_server(struct krping_cb *cb) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); 
} /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } wlat_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_bw_test_server(struct krping_cb *cb) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } if (cb->duplex) bw_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static int reg_supported(struct ib_device *dev) { u64 needed_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; if ((dev->attrs.device_cap_flags & needed_flags) != needed_flags) { printk(KERN_ERR PFX "Fastreg not supported - device_cap_flags 0x%llx\n", (unsigned long long)dev->attrs.device_cap_flags); return 0; } DEBUG_LOG("Fastreg supported - device_cap_flags 0x%llx\n", (unsigned long long)dev->attrs.device_cap_flags); return 1; } static void fill_sockaddr(struct sockaddr_storage *sin, struct krping_cb *cb) { memset(sin, 0, sizeof(*sin)); if (cb->addr_type == AF_INET) { struct sockaddr_in *sin4 = (struct sockaddr_in *)sin; sin4->sin_len = sizeof(*sin4); sin4->sin_family = 
AF_INET; memcpy((void *)&sin4->sin_addr.s_addr, cb->addr, 4); sin4->sin_port = cb->port; } else if (cb->addr_type == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; memcpy((void *)&sin6->sin6_addr, cb->addr, 16); sin6->sin6_port = cb->port; } } static int krping_bind_server(struct krping_cb *cb) { struct sockaddr_storage sin; int ret; fill_sockaddr(&sin, cb); ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *)&sin); if (ret) { printk(KERN_ERR PFX "rdma_bind_addr error %d\n", ret); return ret; } DEBUG_LOG("rdma_bind_addr successful\n"); DEBUG_LOG("rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { printk(KERN_ERR PFX "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { printk(KERN_ERR PFX "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } if (!reg_supported(cb->child_cm_id->device)) return -EINVAL; return 0; } static void krping_run_server(struct krping_cb *cb) { - struct ib_recv_wr *bad_wr; + const struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_server(cb); if (ret) return; ret = krping_setup_qp(cb, cb->child_cm_id); if (ret) { printk(KERN_ERR PFX "setup_qp failed: %d\n", ret); goto err0; } ret = krping_setup_buffers(cb); if (ret) { printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_accept(cb); if (ret) { printk(KERN_ERR PFX "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_server(cb); else if (cb->rlat) krping_rlat_test_server(cb); else if (cb->bw) krping_bw_test_server(cb); else krping_test_server(cb); rdma_disconnect(cb->child_cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); err0: rdma_destroy_id(cb->child_cm_id); } static void krping_test_client(struct 
krping_cb *cb) { int ping, start, cc, i, ret; - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; unsigned char c; start = 65; for (ping = 0; !cb->count || ping < cb->count; ping++) { cb->state = RDMA_READ_ADV; /* Put some ascii text in the buffer. */ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); for (i = cc, c = start; i < cb->size; i++) { cb->start_buf[i] = c; c++; if (c > 122) c = 65; } start++; if (start > 122) start = 65; cb->start_buf[cb->size - 1] = 0; krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { printk(KERN_ERR PFX "krping_format_send failed\n"); break; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); break; } /* Wait for server to ACK */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { printk(KERN_ERR PFX "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } krping_format_send(cb, cb->rdma_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); break; } /* Wait for the server to say the RDMA Write is complete. 
*/ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { printk(KERN_ERR PFX "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } if (cb->validate) if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { printk(KERN_ERR PFX "data mismatch!\n"); break; } if (cb->verbose) printk(KERN_INFO PFX "ping data: %s\n", cb->rdma_buf); #ifdef SLOW_KRPING wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif } } static void krping_rlat_test_client(struct krping_cb *cb) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } #if 0 { int i; struct timeval start, stop; time_t sec; suseconds_t usec; unsigned long long elapsed; struct ib_wc wc; - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; int ne; cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.rkey = cb->remote_rkey; cb->rdma_sq_wr.remote_addr = cb->remote_addr; cb->rdma_sq_wr.wr.sg_list->length = 0; cb->rdma_sq_wr.wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { printk(KERN_ERR PFX "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } 
if (wc.status != IB_WC_SUCCESS) { printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? "server" : "client"); printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } microtime(&stop); if (stop.tv_usec < start.tv_usec) { stop.tv_usec += 1000000; stop.tv_sec -= 1; } sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; printk(KERN_ERR PFX "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif rlat_test(cb); } static void krping_wlat_test_client(struct krping_cb *cb) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } wlat_test(cb); } static void krping_bw_test_client(struct krping_cb *cb) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { 
printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } bw_test(cb); } /* * Manual qp flush test */ static void flush_qp(struct krping_cb *cb) { - struct ib_send_wr wr = { 0 }, *bad; - struct ib_recv_wr recv_wr = { 0 }, *recv_bad; + struct ib_send_wr wr = { 0 }; + const struct ib_send_wr *bad; + struct ib_recv_wr recv_wr = { 0 }; + const struct ib_recv_wr *recv_bad; struct ib_wc wc; int ret; int flushed = 0; int ccnt = 0; rdma_disconnect(cb->cm_id); DEBUG_LOG("disconnected!\n"); wr.opcode = IB_WR_SEND; wr.wr_id = 0xdeadbeefcafebabe; ret = ib_post_send(cb->qp, &wr, &bad); if (ret) { printk(KERN_ERR PFX "%s post_send failed ret %d\n", __func__, ret); return; } recv_wr.wr_id = 0xcafebabedeadbeef; ret = ib_post_recv(cb->qp, &recv_wr, &recv_bad); if (ret) { printk(KERN_ERR PFX "%s post_recv failed ret %d\n", __func__, ret); return; } /* poll until the flush WRs complete */ do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret); return; } if (ret == 0) continue; ccnt++; if (wc.wr_id == 0xdeadbeefcafebabe || wc.wr_id == 0xcafebabedeadbeef) flushed++; } while (flushed != 2); DEBUG_LOG("qp_flushed! 
ccnt %u\n", ccnt); } static void krping_fr_test(struct krping_cb *cb) { - struct ib_send_wr inv, *bad; + struct ib_send_wr inv; + const struct ib_send_wr *bad; struct ib_reg_wr fr; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt = 0; struct scatterlist sg = {0}; mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, plen); if (IS_ERR(mr)) { printk(KERN_ERR PFX "ib_alloc_mr failed %ld\n", PTR_ERR(mr)); return; } sg_dma_address(&sg) = (dma_addr_t)0xcafebabe0000ULL; sg_dma_len(&sg) = size; ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE); if (ret <= 0) { printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret); goto err2; } memset(&fr, 0, sizeof fr); fr.wr.opcode = IB_WR_REG_MR; fr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.mr = mr; fr.wr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; DEBUG_LOG("fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = time_uptime; while (!cb->count || count <= cb->count) { if (SIGPENDING(curthread)) { printk(KERN_ERR PFX "signal!\n"); break; } if ((time_uptime - start) >= 9) { DEBUG_LOG("fr_test: pausing 1 second! 
count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); if (cb->state == ERROR) break; start = time_uptime; } while (scnt < (cb->txdepth>>1)) { ib_update_fast_reg_key(mr, ++key); fr.key = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; size = arc4random() % cb->size; if (size == 0) size = cb->size; sg_dma_len(&sg) = size; ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE); if (ret <= 0) { printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret); goto err2; } ret = ib_post_send(cb->qp, &fr.wr, &bad); if (ret) { printk(KERN_ERR PFX "ib_post_send failed %d\n", ret); goto err2; } scnt++; } ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { printk(KERN_ERR PFX "completion error %u\n", wc.status); goto err2; } count++; scnt--; } } err2: flush_qp(cb); DEBUG_LOG("fr_test: done!\n"); ib_dereg_mr(mr); } static int krping_connect_client(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { printk(KERN_ERR PFX "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state); return -1; } DEBUG_LOG("rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { struct sockaddr_storage sin; int ret; fill_sockaddr(&sin, cb); ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *)&sin, 2000); if (ret) { printk(KERN_ERR PFX "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { printk(KERN_ERR PFX "addr/route resolution did not resolve: state %d\n", cb->state); return 
-EINTR; } if (!reg_supported(cb->cm_id->device)) return -EINVAL; DEBUG_LOG("rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } static void krping_run_client(struct krping_cb *cb) { - struct ib_recv_wr *bad_wr; + const struct ib_recv_wr *bad_wr; int ret; /* set type of service, if any */ if (cb->tos != 0) rdma_set_service_type(cb->cm_id, cb->tos); ret = krping_bind_client(cb); if (ret) return; ret = krping_setup_qp(cb, cb->cm_id); if (ret) { printk(KERN_ERR PFX "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { printk(KERN_ERR PFX "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_client(cb); else if (cb->rlat) krping_rlat_test_client(cb); else if (cb->bw) krping_bw_test_client(cb); else if (cb->frtest) krping_fr_test(cb); else krping_test_client(cb); rdma_disconnect(cb->cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); } static uint16_t krping_get_ipv6_scope_id(char *name) { struct ifnet *ifp; uint16_t retval; if (name == NULL) return (0); CURVNET_SET_QUIET(TD_TO_VNET(curthread)); ifp = ifunit_ref(name); CURVNET_RESTORE(); if (ifp == NULL) return (0); retval = ifp->if_index; if_rele(ifp); return (retval); } int krping_doit(char *cmd) { struct krping_cb *cb; int op; int ret = 0; char *optarg; char *scope; unsigned long optint; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return -ENOMEM; mutex_lock(&krping_mutex); list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { case 'a': cb->addr_str = optarg; cb->addr_type 
= AF_INET; DEBUG_LOG("ipaddr (%s)\n", optarg); if (inet_pton(AF_INET, optarg, cb->addr) != 1) { printk(KERN_ERR PFX "bad addr string %s\n", optarg); ret = EINVAL; } break; case 'A': cb->addr_str = optarg; cb->addr_type = AF_INET6; DEBUG_LOG("ipv6addr (%s)\n", optarg); scope = strstr(optarg, "%"); /* extract scope ID, if any */ if (scope != NULL) *scope++ = 0; /* extract IPv6 network address */ if (inet_pton(AF_INET6, optarg, cb->addr) != 1) { printk(KERN_ERR PFX "bad addr string %s\n", optarg); ret = EINVAL; } else if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)cb->addr) || IN6_IS_ADDR_MC_INTFACELOCAL((struct in6_addr *)cb->addr)) { uint16_t scope_id = krping_get_ipv6_scope_id(scope); DEBUG_LOG("ipv6 scope ID = %d\n", scope_id); cb->addr[2] = scope_id >> 8; cb->addr[3] = scope_id & 0xFF; } break; case 'p': cb->port = htons(optint); DEBUG_LOG("port %d\n", (int)optint); break; case 'P': cb->poll = 1; DEBUG_LOG("server\n"); break; case 's': cb->server = 1; DEBUG_LOG("server\n"); break; case 'c': cb->server = 0; DEBUG_LOG("client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { printk(KERN_ERR PFX "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else DEBUG_LOG("size %d\n", (int)optint); break; case 'C': cb->count = optint; if (cb->count < 0) { printk(KERN_ERR PFX "Invalid count %d\n", cb->count); ret = EINVAL; } else DEBUG_LOG("count %d\n", (int) cb->count); break; case 'v': cb->verbose++; DEBUG_LOG("verbose\n"); break; case 'V': cb->validate++; DEBUG_LOG("validate data\n"); break; case 'l': cb->wlat++; break; case 'L': cb->rlat++; break; case 'B': cb->bw++; break; case 'd': cb->duplex++; break; case 'I': cb->server_invalidate = 1; break; case 't': cb->tos = optint; DEBUG_LOG("type of service, tos=%d\n", (int) cb->tos); break; case 'T': cb->txdepth = optint; DEBUG_LOG("txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; DEBUG_LOG("using local dma lkey\n"); 
break; case 'R': cb->read_inv = 1; DEBUG_LOG("using read-with-inv\n"); break; case 'f': cb->frtest = 1; DEBUG_LOG("fast-reg test!\n"); break; default: printk(KERN_ERR PFX "unknown opt %s\n", optarg); ret = -EINVAL; break; } } if (ret) goto out; if (cb->server == -1) { printk(KERN_ERR PFX "must be either client or server\n"); ret = -EINVAL; goto out; } if (cb->server && cb->frtest) { printk(KERN_ERR PFX "must be client to run frtest\n"); ret = -EINVAL; goto out; } if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { printk(KERN_ERR PFX "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } if (cb->wlat || cb->rlat || cb->bw) { printk(KERN_ERR PFX "wlat, rlat, and bw tests only support mem_mode MR - which is no longer supported\n"); ret = -EINVAL; goto out; } cb->cm_id = rdma_create_id(TD_TO_VNET(curthread), krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); printk(KERN_ERR PFX "rdma_create_id error %d\n", ret); goto out; } DEBUG_LOG("created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); DEBUG_LOG("destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); list_del(&cb->list); mutex_unlock(&krping_mutex); kfree(cb); return ret; } void krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) (*f)(cb->pd ? 
&cb->stats : NULL, arg); mutex_unlock(&krping_mutex); } void krping_cancel_all(void) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) { cb->state = ERROR; wake_up_interruptible(&cb->sem); } mutex_unlock(&krping_mutex); } diff --git a/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h b/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h index 3664895200c1..59ca38a96004 100644 --- a/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h +++ b/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h @@ -1,989 +1,989 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $FreeBSD$ */ #ifndef __IW_CXGB4_H__ #define __IW_CXGB4_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "t4_l2t.h" #define DRV_NAME "iw_cxgbe" #define MOD DRV_NAME ":" #define KTR_IW_CXGBE KTR_SPARE3 extern int c4iw_debug; extern int use_dsgl; extern int inline_threshold; #define PDBG(fmt, args...) \ do { \ if (c4iw_debug) \ printf(MOD fmt, ## args); \ } while (0) #include "t4.h" static inline void *cplhdr(struct mbuf *m) { return mtod(m, void*); } #define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->adap->vres.pbl.start) #define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->adap->vres.rq.start) #define C4IW_ID_TABLE_F_RANDOM 1 /* Pseudo-randomize the id's returned */ #define C4IW_ID_TABLE_F_EMPTY 2 /* Table is initially empty */ #define C4IW_MAX_PAGE_SIZE 0x8000000 struct c4iw_id_table { u32 flags; u32 start; /* logical minimal id */ u32 last; /* hint for find */ u32 max; spinlock_t lock; unsigned long *table; }; struct c4iw_resource { struct c4iw_id_table tpt_table; struct c4iw_id_table qid_table; struct c4iw_id_table pdid_table; }; struct c4iw_qid_list { struct list_head entry; u32 qid; }; struct c4iw_dev_ucontext { struct list_head qpids; struct list_head cqids; struct mutex lock; }; enum c4iw_rdev_flags { T4_FATAL_ERROR = (1<<0), T4_STATUS_PAGE_DISABLED = (1<<1), }; struct c4iw_stat { u64 total; u64 cur; u64 max; u64 fail; }; struct c4iw_stats { struct mutex lock; struct c4iw_stat qid; struct c4iw_stat pd; struct c4iw_stat stag; struct c4iw_stat pbl; struct c4iw_stat rqt; }; struct c4iw_hw_queue { int t4_eq_status_entries; int t4_max_eq_size; int t4_max_iq_size; int t4_max_rq_size; int t4_max_sq_size; int t4_max_qp_depth; int t4_max_cq_depth; int t4_stat_len; }; struct c4iw_rdev { struct adapter *adap; struct c4iw_resource resource; unsigned long 
/*
 * Wait for the completion in wr_waitp to be signalled.
 *
 * Returns immediately with -EIO when the rdev is already in fatal error.
 * Otherwise waits in a loop, starting with a C4IW_WR_TO timeout that is
 * multiplied by four after every expiry.  Before each wait, a pending
 * error on the optional socket "so" ends the wait with -ECONNRESET.  Each
 * expiry is logged, and a fatal device error seen after an expiry ends
 * the wait with -EIO.
 *
 * hwtid, qpid and func are used only in the log/trace messages.
 *
 * Returns wr_waitp->ret: whatever c4iw_wake_up() stored, or one of the
 * error codes above.
 */
static inline int
c4iw_wait_for_reply(struct c4iw_rdev *rdev, struct c4iw_wr_wait *wr_waitp,
		u32 hwtid, u32 qpid, struct socket *so, const char *func)
{
	struct adapter *sc = rdev->adap;
	unsigned to = C4IW_WR_TO;
	int ret;
	int timedout = 0;
	struct timeval t1, t2;

	if (c4iw_fatal_error(rdev)) {
		wr_waitp->ret = -EIO;
		goto out;
	}

	getmicrotime(&t1);
	do {
		/* If waiting for reply in rdma_init()/rdma_fini() threads, then
		 * check if there are any connection errors.
		 */
		if (so && so->so_error) {
			wr_waitp->ret = -ECONNRESET;
			/*
			 * The first literal ends with a space so the
			 * concatenated format reads "sock %p tid %u".
			 */
			CTR5(KTR_IW_CXGBE, "%s - Connection ERROR %u for sock %p "
			    "tid %u qpid %u", func,
			    so->so_error, so, hwtid, qpid);
			break;
		}

		ret = wait_for_completion_timeout(&wr_waitp->completion, to);
		if (!ret) {
			getmicrotime(&t2);
			timevalsub(&t2, &t1);
			/* Cast so the arguments always match %ld. */
			printf("%s - Device %s not responding after %ld.%06ld "
			    "seconds - tid %u qpid %u\n", func,
			    device_get_nameunit(sc->dev), (long)t2.tv_sec,
			    (long)t2.tv_usec, hwtid, qpid);
			if (c4iw_fatal_error(rdev)) {
				wr_waitp->ret = -EIO;
				break;
			}
			/* Back off: wait four times longer next time. */
			to = to << 2;
			timedout = 1;
		}
	} while (!ret);

out:
	if (timedout) {
		getmicrotime(&t2);
		timevalsub(&t2, &t1);
		printf("%s - Device %s reply after %ld.%06ld seconds - "
		    "tid %u qpid %u\n", func, device_get_nameunit(sc->dev),
		    (long)t2.tv_sec, (long)t2.tv_usec, hwtid, qpid);
	}
	if (wr_waitp->ret)
		CTR4(KTR_IW_CXGBE, "%p: FW reply %d tid %u qpid %u", sc,
		    wr_waitp->ret, hwtid, qpid);
	return (wr_waitp->ret);
}
GFP_KERNEL : GFP_ATOMIC)) return -ENOMEM; if (lock) spin_lock_irq(&rhp->lock); ret = idr_get_new_above(idr, handle, id, &newid); BUG_ON(!ret && newid != id); if (lock) spin_unlock_irq(&rhp->lock); } while (ret == -EAGAIN); return ret; } static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr, void *handle, u32 id) { return _insert_handle(rhp, idr, handle, id, 1); } static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr, void *handle, u32 id) { return _insert_handle(rhp, idr, handle, id, 0); } static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id, int lock) { if (lock) spin_lock_irq(&rhp->lock); idr_remove(idr, id); if (lock) spin_unlock_irq(&rhp->lock); } static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id) { _remove_handle(rhp, idr, id, 1); } static inline void remove_handle_nolock(struct c4iw_dev *rhp, struct idr *idr, u32 id) { _remove_handle(rhp, idr, id, 0); } extern int c4iw_max_read_depth; static inline int cur_max_read_depth(struct c4iw_dev *dev) { return min(dev->rdev.adap->params.max_ordird_qp, c4iw_max_read_depth); } struct c4iw_pd { struct ib_pd ibpd; u32 pdid; struct c4iw_dev *rhp; }; static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd) { return container_of(ibpd, struct c4iw_pd, ibpd); } struct tpt_attributes { u64 len; u64 va_fbo; enum fw_ri_mem_perms perms; u32 stag; u32 pdid; u32 qpid; u32 pbl_addr; u32 pbl_size; u32 state:1; u32 type:2; u32 rsvd:1; u32 remote_invaliate_disable:1; u32 zbva:1; u32 mw_bind_enable:1; u32 page_size:5; }; struct c4iw_mr { struct ib_mr ibmr; struct ib_umem *umem; struct c4iw_dev *rhp; u64 kva; struct tpt_attributes attr; u64 *mpl; dma_addr_t mpl_addr; u32 max_mpl_len; u32 mpl_len; }; static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr) { return container_of(ibmr, struct c4iw_mr, ibmr); } struct c4iw_mw { struct ib_mw ibmw; struct c4iw_dev *rhp; u64 kva; struct tpt_attributes attr; }; static inline struct c4iw_mw 
*to_c4iw_mw(struct ib_mw *ibmw) { return container_of(ibmw, struct c4iw_mw, ibmw); } struct c4iw_cq { struct ib_cq ibcq; struct c4iw_dev *rhp; struct t4_cq cq; spinlock_t lock; spinlock_t comp_handler_lock; atomic_t refcnt; wait_queue_head_t wait; }; static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq) { return container_of(ibcq, struct c4iw_cq, ibcq); } struct c4iw_mpa_attributes { u8 initiator; u8 recv_marker_enabled; u8 xmit_marker_enabled; u8 crc_enabled; u8 enhanced_rdma_conn; u8 version; u8 p2p_type; }; struct c4iw_qp_attributes { u32 scq; u32 rcq; u32 sq_num_entries; u32 rq_num_entries; u32 sq_max_sges; u32 sq_max_sges_rdma_write; u32 rq_max_sges; u32 state; u8 enable_rdma_read; u8 enable_rdma_write; u8 enable_bind; u8 enable_mmid0_fastreg; u32 max_ord; u32 max_ird; u32 pd; u32 next_state; char terminate_buffer[52]; u32 terminate_msg_len; u8 is_terminate_local; struct c4iw_mpa_attributes mpa_attr; struct c4iw_ep *llp_stream_handle; u8 layer_etype; u8 ecode; u16 sq_db_inc; u16 rq_db_inc; u8 send_term; }; struct c4iw_qp { struct ib_qp ibqp; struct c4iw_dev *rhp; struct c4iw_ep *ep; struct c4iw_qp_attributes attr; struct t4_wq wq; spinlock_t lock; struct mutex mutex; struct kref kref; wait_queue_head_t wait; struct timer_list timer; int sq_sig_all; struct work_struct free_work; struct c4iw_ucontext *ucontext; }; static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) { return container_of(ibqp, struct c4iw_qp, ibqp); } struct c4iw_ucontext { struct ib_ucontext ibucontext; struct c4iw_dev_ucontext uctx; u32 key; spinlock_t mmap_lock; struct list_head mmaps; struct kref kref; }; static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) { return container_of(c, struct c4iw_ucontext, ibucontext); } void _c4iw_free_ucontext(struct kref *kref); static inline void c4iw_put_ucontext(struct c4iw_ucontext *ucontext) { kref_put(&ucontext->kref, _c4iw_free_ucontext); } static inline void c4iw_get_ucontext(struct c4iw_ucontext *ucontext) { 
kref_get(&ucontext->kref); } struct c4iw_mm_entry { struct list_head entry; u64 addr; u32 key; unsigned len; }; static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, u32 key, unsigned len) { struct list_head *pos, *nxt; struct c4iw_mm_entry *mm; spin_lock(&ucontext->mmap_lock); list_for_each_safe(pos, nxt, &ucontext->mmaps) { mm = list_entry(pos, struct c4iw_mm_entry, entry); if (mm->key == key && mm->len == len) { list_del_init(&mm->entry); spin_unlock(&ucontext->mmap_lock); CTR4(KTR_IW_CXGBE, "%s key 0x%x addr 0x%llx len %d", __func__, key, (unsigned long long) mm->addr, mm->len); return mm; } } spin_unlock(&ucontext->mmap_lock); return NULL; } static inline void insert_mmap(struct c4iw_ucontext *ucontext, struct c4iw_mm_entry *mm) { spin_lock(&ucontext->mmap_lock); CTR4(KTR_IW_CXGBE, "%s key 0x%x addr 0x%llx len %d", __func__, mm->key, (unsigned long long) mm->addr, mm->len); list_add_tail(&mm->entry, &ucontext->mmaps); spin_unlock(&ucontext->mmap_lock); } enum c4iw_qp_attr_mask { C4IW_QP_ATTR_NEXT_STATE = 1 << 0, C4IW_QP_ATTR_SQ_DB = 1<<1, C4IW_QP_ATTR_RQ_DB = 1<<2, C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, C4IW_QP_ATTR_MAX_ORD = 1 << 11, C4IW_QP_ATTR_MAX_IRD = 1 << 12, C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, C4IW_QP_ATTR_MPA_ATTR = 1 << 24, C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ | C4IW_QP_ATTR_ENABLE_RDMA_WRITE | C4IW_QP_ATTR_MAX_ORD | C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_STREAM_MSG_BUFFER | C4IW_QP_ATTR_MPA_ATTR | C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE) }; int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, enum c4iw_qp_attr_mask mask, struct c4iw_qp_attributes *attrs, int internal); enum c4iw_qp_state { C4IW_QP_STATE_IDLE, C4IW_QP_STATE_RTS, C4IW_QP_STATE_ERROR, C4IW_QP_STATE_TERMINATE, C4IW_QP_STATE_CLOSING, 
C4IW_QP_STATE_TOT }; /* * IW_CXGBE event bits. * These bits are used for handling all events for a particular 'ep' serially. */ #define C4IW_EVENT_SOCKET 0x0001 #define C4IW_EVENT_TIMEOUT 0x0002 #define C4IW_EVENT_TERM 0x0004 static inline int c4iw_convert_state(enum ib_qp_state ib_state) { switch (ib_state) { case IB_QPS_RESET: case IB_QPS_INIT: return C4IW_QP_STATE_IDLE; case IB_QPS_RTS: return C4IW_QP_STATE_RTS; case IB_QPS_SQD: return C4IW_QP_STATE_CLOSING; case IB_QPS_SQE: return C4IW_QP_STATE_TERMINATE; case IB_QPS_ERR: return C4IW_QP_STATE_ERROR; default: return -1; } } static inline int to_ib_qp_state(int c4iw_qp_state) { switch (c4iw_qp_state) { case C4IW_QP_STATE_IDLE: return IB_QPS_INIT; case C4IW_QP_STATE_RTS: return IB_QPS_RTS; case C4IW_QP_STATE_CLOSING: return IB_QPS_SQD; case C4IW_QP_STATE_TERMINATE: return IB_QPS_SQE; case C4IW_QP_STATE_ERROR: return IB_QPS_ERR; } return IB_QPS_ERR; } #define C4IW_DRAIN_OPCODE FW_RI_SGE_EC_CR_RETURN static inline u32 c4iw_ib_to_tpt_access(int a) { return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | (a & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0) | (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) | FW_RI_MEM_ACCESS_LOCAL_READ; } static inline u32 c4iw_ib_to_tpt_bind_access(int acc) { return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? 
FW_RI_MEM_ACCESS_REM_READ : 0); } enum c4iw_mmid_state { C4IW_STAG_STATE_VALID, C4IW_STAG_STATE_INVALID }; #define C4IW_NODE_DESC "iw_cxgbe Chelsio Communications" #define MPA_KEY_REQ "MPA ID Req Frame" #define MPA_KEY_REP "MPA ID Rep Frame" #define MPA_MAX_PRIVATE_DATA 256 #define MPA_ENHANCED_RDMA_CONN 0x10 #define MPA_REJECT 0x20 #define MPA_CRC 0x40 #define MPA_MARKERS 0x80 #define MPA_FLAGS_MASK 0xE0 #define MPA_V2_PEER2PEER_MODEL 0x8000 #define MPA_V2_ZERO_LEN_FPDU_RTR 0x4000 #define MPA_V2_RDMA_WRITE_RTR 0x8000 #define MPA_V2_RDMA_READ_RTR 0x4000 #define MPA_V2_IRD_ORD_MASK 0x3FFF #define c4iw_put_ep(ep) { \ CTR4(KTR_IW_CXGBE, "put_ep (%s:%u) ep %p, refcnt %d", \ __func__, __LINE__, ep, atomic_read(&(ep)->kref.refcount)); \ WARN_ON(atomic_read(&(ep)->kref.refcount) < 1); \ kref_put(&((ep)->kref), _c4iw_free_ep); \ } #define c4iw_get_ep(ep) { \ CTR4(KTR_IW_CXGBE, "get_ep (%s:%u) ep %p, refcnt %d", \ __func__, __LINE__, ep, atomic_read(&(ep)->kref.refcount)); \ kref_get(&((ep)->kref)); \ } void _c4iw_free_ep(struct kref *kref); struct mpa_message { u8 key[16]; u8 flags; u8 revision; __be16 private_data_size; u8 private_data[0]; }; struct mpa_v2_conn_params { __be16 ird; __be16 ord; }; struct terminate_message { u8 layer_etype; u8 ecode; __be16 hdrct_rsvd; u8 len_hdrs[0]; }; #define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) enum c4iw_layers_types { LAYER_RDMAP = 0x00, LAYER_DDP = 0x10, LAYER_MPA = 0x20, RDMAP_LOCAL_CATA = 0x00, RDMAP_REMOTE_PROT = 0x01, RDMAP_REMOTE_OP = 0x02, DDP_LOCAL_CATA = 0x00, DDP_TAGGED_ERR = 0x01, DDP_UNTAGGED_ERR = 0x02, DDP_LLP = 0x03 }; enum c4iw_rdma_ecodes { RDMAP_INV_STAG = 0x00, RDMAP_BASE_BOUNDS = 0x01, RDMAP_ACC_VIOL = 0x02, RDMAP_STAG_NOT_ASSOC = 0x03, RDMAP_TO_WRAP = 0x04, RDMAP_INV_VERS = 0x05, RDMAP_INV_OPCODE = 0x06, RDMAP_STREAM_CATA = 0x07, RDMAP_GLOBAL_CATA = 0x08, RDMAP_CANT_INV_STAG = 0x09, RDMAP_UNSPECIFIED = 0xff }; enum c4iw_ddp_ecodes { DDPT_INV_STAG = 0x00, DDPT_BASE_BOUNDS = 0x01, 
DDPT_STAG_NOT_ASSOC = 0x02, DDPT_TO_WRAP = 0x03, DDPT_INV_VERS = 0x04, DDPU_INV_QN = 0x01, DDPU_INV_MSN_NOBUF = 0x02, DDPU_INV_MSN_RANGE = 0x03, DDPU_INV_MO = 0x04, DDPU_MSG_TOOBIG = 0x05, DDPU_INV_VERS = 0x06 }; enum c4iw_mpa_ecodes { MPA_CRC_ERR = 0x02, MPA_MARKER_ERR = 0x03, MPA_LOCAL_CATA = 0x05, MPA_INSUFF_IRD = 0x06, MPA_NOMATCH_RTR = 0x07, }; enum c4iw_ep_state { IDLE = 0, LISTEN, CONNECTING, MPA_REQ_WAIT, MPA_REQ_SENT, MPA_REQ_RCVD, MPA_REP_SENT, FPDU_MODE, ABORTING, CLOSING, MORIBUND, DEAD, }; enum c4iw_ep_flags { PEER_ABORT_IN_PROGRESS = 0, ABORT_REQ_IN_PROGRESS = 1, RELEASE_RESOURCES = 2, CLOSE_SENT = 3, TIMEOUT = 4, QP_REFERENCED = 5, STOP_MPA_TIMER = 7, }; enum c4iw_ep_history { ACT_OPEN_REQ = 0, ACT_OFLD_CONN = 1, ACT_OPEN_RPL = 2, ACT_ESTAB = 3, PASS_ACCEPT_REQ = 4, PASS_ESTAB = 5, ABORT_UPCALL = 6, ESTAB_UPCALL = 7, CLOSE_UPCALL = 8, ULP_ACCEPT = 9, ULP_REJECT = 10, TIMEDOUT = 11, PEER_ABORT = 12, PEER_CLOSE = 13, CONNREQ_UPCALL = 14, ABORT_CONN = 15, DISCONN_UPCALL = 16, EP_DISC_CLOSE = 17, EP_DISC_ABORT = 18, CONN_RPL_UPCALL = 19, ACT_RETRY_NOMEM = 20, ACT_RETRY_INUSE = 21, CLOSE_CON_RPL = 22, EP_DISC_FAIL = 24, QP_REFED = 25, QP_DEREFED = 26, CM_ID_REFED = 27, CM_ID_DEREFED = 28 }; struct c4iw_ep_common { TAILQ_ENTRY(c4iw_ep_common) entry; /* Work queue attachment */ struct iw_cm_id *cm_id; struct c4iw_qp *qp; struct c4iw_dev *dev; enum c4iw_ep_state state; struct kref kref; struct mutex mutex; struct sockaddr_storage local_addr; struct sockaddr_storage remote_addr; struct c4iw_wr_wait wr_wait; unsigned long flags; unsigned long history; int rpl_err; int rpl_done; struct thread *thread; struct socket *so; int ep_events; }; struct c4iw_listen_ep { struct c4iw_ep_common com; unsigned int stid; int backlog; struct list_head listen_ep_list; /* list of all listener ep's bound to one port address */ }; struct c4iw_ep { struct c4iw_ep_common com; struct c4iw_listen_ep *parent_ep; struct timer_list timer; unsigned int atid; u32 hwtid; u32 snd_seq; u32 
rcv_seq; struct l2t_entry *l2t; struct dst_entry *dst; struct c4iw_mpa_attributes mpa_attr; u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; unsigned int mpa_pkt_len; u32 ird; u32 ord; u32 tx_chan; u32 mtu; u16 mss; u16 plen; u16 rss_qid; u16 txq_idx; u16 ctrlq_idx; u8 tos; u8 retry_with_mpa_v1; u8 tried_with_mpa_v1; }; static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; } static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; } static inline int compute_wscale(int win) { int wscale = 0; while (wscale < 14 && (65535< __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct sge_iq; struct rss_header; struct cpl_set_tcb_rpl; #include #include "offload.h" #include "tom/t4_tom.h" #include "iw_cxgbe.h" #include "user.h" static int creds(struct toepcb *toep, struct inpcb *inp, size_t wrsize); static int max_fr_immd = T4_MAX_FR_IMMD;//SYSCTL parameter later... 
static int alloc_ird(struct c4iw_dev *dev, u32 ird) { int ret = 0; spin_lock_irq(&dev->lock); if (ird <= dev->avail_ird) dev->avail_ird -= ird; else ret = -ENOMEM; spin_unlock_irq(&dev->lock); if (ret) log(LOG_WARNING, "%s: device IRD resources exhausted\n", device_get_nameunit(dev->rdev.adap->dev)); return ret; } static void free_ird(struct c4iw_dev *dev, int ird) { spin_lock_irq(&dev->lock); dev->avail_ird += ird; spin_unlock_irq(&dev->lock); } static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state) { unsigned long flag; spin_lock_irqsave(&qhp->lock, flag); qhp->attr.state = state; spin_unlock_irqrestore(&qhp->lock, flag); } static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct c4iw_dev_ucontext *uctx) { struct c4iw_dev *rhp = rdev_to_c4iw_dev(rdev); /* * uP clears EQ contexts when the connection exits rdma mode, * so no need to post a RESET WR for these EQs. */ dma_free_coherent(rhp->ibdev.dma_device, wq->rq.memsize, wq->rq.queue, dma_unmap_addr(&wq->rq, mapping)); dma_free_coherent(rhp->ibdev.dma_device, wq->sq.memsize, wq->sq.queue, dma_unmap_addr(&wq->sq, mapping)); c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); kfree(wq->rq.sw_rq); kfree(wq->sq.sw_sq); c4iw_put_qpid(rdev, wq->rq.qid, uctx); c4iw_put_qpid(rdev, wq->sq.qid, uctx); return 0; } static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct t4_cq *rcq, struct t4_cq *scq, struct c4iw_dev_ucontext *uctx) { struct adapter *sc = rdev->adap; struct c4iw_dev *rhp = rdev_to_c4iw_dev(rdev); int user = (uctx != &rdev->uctx); struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; struct c4iw_wr_wait wr_wait; int ret = 0; int eqsize; struct wrqe *wr; u64 sq_bar2_qoffset = 0, rq_bar2_qoffset = 0; wq->sq.qid = c4iw_get_qpid(rdev, uctx); if (!wq->sq.qid) return -ENOMEM; wq->rq.qid = c4iw_get_qpid(rdev, uctx); if (!wq->rq.qid) { ret = -ENOMEM; goto free_sq_qid; } if (!user) { wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq, GFP_KERNEL); if 
(!wq->sq.sw_sq) { ret = -ENOMEM; goto free_rq_qid; } wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq, GFP_KERNEL); if (!wq->rq.sw_rq) { ret = -ENOMEM; goto free_sw_sq; } } /* * RQT must be a power of 2 and at least 16 deep. */ wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16)); wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size); if (!wq->rq.rqt_hwaddr) { ret = -ENOMEM; goto free_sw_rq; } /*QP memory, allocate DMAable memory for Send & Receive Queues */ wq->sq.queue = dma_alloc_coherent(rhp->ibdev.dma_device, wq->sq.memsize, &(wq->sq.dma_addr), GFP_KERNEL); if (!wq->sq.queue) { ret = -ENOMEM; goto free_hwaddr; } wq->sq.phys_addr = vtophys(wq->sq.queue); dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); memset(wq->sq.queue, 0, wq->sq.memsize); wq->rq.queue = dma_alloc_coherent(rhp->ibdev.dma_device, wq->rq.memsize, &(wq->rq.dma_addr), GFP_KERNEL); if (!wq->rq.queue) { ret = -ENOMEM; goto free_sq_dma; } wq->rq.phys_addr = vtophys(wq->rq.queue); dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); memset(wq->rq.queue, 0, wq->rq.memsize); CTR5(KTR_IW_CXGBE, "%s QP sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx", __func__, wq->sq.queue, (unsigned long long)wq->sq.phys_addr, wq->rq.queue, (unsigned long long)wq->rq.phys_addr); /* Doorbell/WC regions, determine the BAR2 queue offset and qid. */ t4_bar2_sge_qregs(rdev->adap, wq->sq.qid, T4_BAR2_QTYPE_EGRESS, user, &sq_bar2_qoffset, &wq->sq.bar2_qid); t4_bar2_sge_qregs(rdev->adap, wq->rq.qid, T4_BAR2_QTYPE_EGRESS, user, &rq_bar2_qoffset, &wq->rq.bar2_qid); if (user) { /* Compute BAR2 DB/WC physical address(page-aligned) for * Userspace mapping. 
*/ wq->sq.bar2_pa = (rdev->bar2_pa + sq_bar2_qoffset) & PAGE_MASK; wq->rq.bar2_pa = (rdev->bar2_pa + rq_bar2_qoffset) & PAGE_MASK; CTR3(KTR_IW_CXGBE, "%s BAR2 DB/WC sq base pa 0x%llx rq base pa 0x%llx", __func__, (unsigned long long)wq->sq.bar2_pa, (unsigned long long)wq->rq.bar2_pa); } else { /* Compute BAR2 DB/WC virtual address to access in kernel. */ wq->sq.bar2_va = (void __iomem *)((u64)rdev->bar2_kva + sq_bar2_qoffset); wq->rq.bar2_va = (void __iomem *)((u64)rdev->bar2_kva + rq_bar2_qoffset); CTR3(KTR_IW_CXGBE, "%s BAR2 DB/WC sq base va %p rq base va %p", __func__, (unsigned long long)wq->sq.bar2_va, (unsigned long long)wq->rq.bar2_va); } wq->rdev = rdev; wq->rq.msn = 1; /* build fw_ri_res_wr */ wr_len = sizeof *res_wr + 2 * sizeof *res; wr = alloc_wrqe(wr_len, &sc->sge.ctrlq[0]); if (wr == NULL) { ret = -ENOMEM; goto free_rq_dma; } res_wr = wrtod(wr); memset(res_wr, 0, wr_len); res_wr->op_nres = cpu_to_be32( V_FW_WR_OP(FW_RI_RES_WR) | V_FW_RI_RES_WR_NRES(2) | F_FW_WR_COMPL); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); res_wr->cookie = (unsigned long) &wr_wait; res = res_wr->res; res->u.sqrq.restype = FW_RI_RES_TYPE_SQ; res->u.sqrq.op = FW_RI_RES_OP_WRITE; /* eqsize is the number of 64B entries plus the status page size. */ eqsize = wq->sq.size * T4_SQ_NUM_SLOTS + rdev->hw_queue.t4_eq_status_entries; res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ V_FW_RI_RES_WR_IQID(scq->cqid)); res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | V_FW_RI_RES_WR_DCACPU(0) | V_FW_RI_RES_WR_FBMIN(chip_id(sc) <= CHELSIO_T5 ? 
X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | V_FW_RI_RES_WR_FBMAX(3) | V_FW_RI_RES_WR_CIDXFTHRESHO(0) | V_FW_RI_RES_WR_CIDXFTHRESH(0) | V_FW_RI_RES_WR_EQSIZE(eqsize)); res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid); res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr); res++; res->u.sqrq.restype = FW_RI_RES_TYPE_RQ; res->u.sqrq.op = FW_RI_RES_OP_WRITE; /* eqsize is the number of 64B entries plus the status page size. */ eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + rdev->hw_queue.t4_eq_status_entries; res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ V_FW_RI_RES_WR_IQID(rcq->cqid)); res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | V_FW_RI_RES_WR_DCACPU(0) | V_FW_RI_RES_WR_FBMIN(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | V_FW_RI_RES_WR_FBMAX(3) | V_FW_RI_RES_WR_CIDXFTHRESHO(0) | V_FW_RI_RES_WR_CIDXFTHRESH(0) | V_FW_RI_RES_WR_EQSIZE(eqsize)); res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); c4iw_init_wr_wait(&wr_wait); t4_wrq_tx(sc, wr); ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, NULL, __func__); if (ret) goto free_rq_dma; CTR5(KTR_IW_CXGBE, "%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%llx rqudb 0x%llx", __func__, wq->sq.qid, wq->rq.qid, (unsigned long long)wq->sq.bar2_va, (unsigned long long)wq->rq.bar2_va); return 0; free_rq_dma: dma_free_coherent(rhp->ibdev.dma_device, wq->rq.memsize, wq->rq.queue, dma_unmap_addr(&wq->rq, mapping)); free_sq_dma: dma_free_coherent(rhp->ibdev.dma_device, wq->sq.memsize, wq->sq.queue, dma_unmap_addr(&wq->sq, mapping)); free_hwaddr: c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); free_sw_rq: kfree(wq->rq.sw_rq); free_sw_sq: kfree(wq->sq.sw_sq); free_rq_qid: c4iw_put_qpid(rdev, wq->rq.qid, uctx); free_sq_qid: c4iw_put_qpid(rdev, wq->sq.qid, uctx); return 
ret; } static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, - struct ib_send_wr *wr, int max, u32 *plenp) + const struct ib_send_wr *wr, int max, u32 *plenp) { u8 *dstp, *srcp; u32 plen = 0; int i; int rem, len; dstp = (u8 *)immdp->data; for (i = 0; i < wr->num_sge; i++) { if ((plen + wr->sg_list[i].length) > max) return -EMSGSIZE; srcp = (u8 *)(unsigned long)wr->sg_list[i].addr; plen += wr->sg_list[i].length; rem = wr->sg_list[i].length; while (rem) { if (dstp == (u8 *)&sq->queue[sq->size]) dstp = (u8 *)sq->queue; if (rem <= (u8 *)&sq->queue[sq->size] - dstp) len = rem; else len = (u8 *)&sq->queue[sq->size] - dstp; memcpy(dstp, srcp, len); dstp += len; srcp += len; rem -= len; } } len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp); if (len) memset(dstp, 0, len); immdp->op = FW_RI_DATA_IMMD; immdp->r1 = 0; immdp->r2 = 0; immdp->immdlen = cpu_to_be32(plen); *plenp = plen; return 0; } static int build_isgl(__be64 *queue_start, __be64 *queue_end, struct fw_ri_isgl *isglp, struct ib_sge *sg_list, int num_sge, u32 *plenp) { int i; u32 plen = 0; __be64 *flitp = (__be64 *)isglp->sge; for (i = 0; i < num_sge; i++) { if ((plen + sg_list[i].length) < plen) return -EMSGSIZE; plen += sg_list[i].length; *flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) | sg_list[i].length); if (++flitp == queue_end) flitp = queue_start; *flitp = cpu_to_be64(sg_list[i].addr); if (++flitp == queue_end) flitp = queue_start; } *flitp = (__force __be64)0; isglp->op = FW_RI_DATA_ISGL; isglp->r1 = 0; isglp->nsge = cpu_to_be16(num_sge); isglp->r2 = 0; if (plenp) *plenp = plen; return 0; } static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, - struct ib_send_wr *wr, u8 *len16) + const struct ib_send_wr *wr, u8 *len16) { u32 plen; int size; int ret; if (wr->num_sge > T4_MAX_SEND_SGE) return -EINVAL; switch (wr->opcode) { case IB_WR_SEND: if (wr->send_flags & IB_SEND_SOLICITED) wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE)); else 
wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND)); wqe->send.stag_inv = 0; break; case IB_WR_SEND_WITH_INV: if (wr->send_flags & IB_SEND_SOLICITED) wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE_INV)); else wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_INV)); wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); break; default: return -EINVAL; } wqe->send.r3 = 0; wqe->send.r4 = 0; plen = 0; if (wr->num_sge) { if (wr->send_flags & IB_SEND_INLINE) { ret = build_immd(sq, wqe->send.u.immd_src, wr, T4_MAX_SEND_INLINE, &plen); if (ret) return ret; size = sizeof wqe->send + sizeof(struct fw_ri_immd) + plen; } else { ret = build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], wqe->send.u.isgl_src, wr->sg_list, wr->num_sge, &plen); if (ret) return ret; size = sizeof wqe->send + sizeof(struct fw_ri_isgl) + wr->num_sge * sizeof(struct fw_ri_sge); } } else { wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD; wqe->send.u.immd_src[0].r1 = 0; wqe->send.u.immd_src[0].r2 = 0; wqe->send.u.immd_src[0].immdlen = 0; size = sizeof wqe->send + sizeof(struct fw_ri_immd); plen = 0; } *len16 = DIV_ROUND_UP(size, 16); wqe->send.plen = cpu_to_be32(plen); return 0; } static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, - struct ib_send_wr *wr, u8 *len16) + const struct ib_send_wr *wr, u8 *len16) { u32 plen; int size; int ret; if (wr->num_sge > T4_MAX_SEND_SGE) return -EINVAL; wqe->write.immd_data = 0; wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); if (wr->num_sge) { if (wr->send_flags & IB_SEND_INLINE) { ret = build_immd(sq, wqe->write.u.immd_src, wr, T4_MAX_WRITE_INLINE, &plen); if (ret) return ret; size = sizeof wqe->write + sizeof(struct fw_ri_immd) + plen; } else { ret = build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], wqe->write.u.isgl_src, wr->sg_list, wr->num_sge, &plen); if (ret) return ret; size = 
sizeof wqe->write + sizeof(struct fw_ri_isgl) + wr->num_sge * sizeof(struct fw_ri_sge); } } else { wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD; wqe->write.u.immd_src[0].r1 = 0; wqe->write.u.immd_src[0].r2 = 0; wqe->write.u.immd_src[0].immdlen = 0; size = sizeof wqe->write + sizeof(struct fw_ri_immd); plen = 0; } *len16 = DIV_ROUND_UP(size, 16); wqe->write.plen = cpu_to_be32(plen); return 0; } -static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +static int build_rdma_read(union t4_wr *wqe, const struct ib_send_wr *wr, u8 *len16) { if (wr->num_sge > 1) return -EINVAL; if (wr->num_sge && wr->sg_list[0].length) { wqe->read.stag_src = cpu_to_be32(rdma_wr(wr)->rkey); wqe->read.to_src_hi = cpu_to_be32((u32)(rdma_wr(wr)->remote_addr >> 32)); wqe->read.to_src_lo = cpu_to_be32((u32)rdma_wr(wr)->remote_addr); wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey); wqe->read.plen = cpu_to_be32(wr->sg_list[0].length); wqe->read.to_sink_hi = cpu_to_be32((u32)(wr->sg_list[0].addr >> 32)); wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr)); } else { wqe->read.stag_src = cpu_to_be32(2); wqe->read.to_src_hi = 0; wqe->read.to_src_lo = 0; wqe->read.stag_sink = cpu_to_be32(2); wqe->read.plen = 0; wqe->read.to_sink_hi = 0; wqe->read.to_sink_lo = 0; } wqe->read.r2 = 0; wqe->read.r5 = 0; *len16 = DIV_ROUND_UP(sizeof wqe->read, 16); return 0; } static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, - struct ib_recv_wr *wr, u8 *len16) + const struct ib_recv_wr *wr, u8 *len16) { int ret; ret = build_isgl((__be64 *)qhp->wq.rq.queue, (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size], &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); if (ret) return ret; *len16 = DIV_ROUND_UP(sizeof wqe->recv + wr->num_sge * sizeof(struct fw_ri_sge), 16); return 0; } -static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, +static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr, u8 *len16) { wqe->inv.stag_inv = 
cpu_to_be32(wr->ex.invalidate_rkey); wqe->inv.r2 = 0; *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16); return 0; } static void free_qp_work(struct work_struct *work) { struct c4iw_ucontext *ucontext; struct c4iw_qp *qhp; struct c4iw_dev *rhp; qhp = container_of(work, struct c4iw_qp, free_work); ucontext = qhp->ucontext; rhp = qhp->rhp; CTR3(KTR_IW_CXGBE, "%s qhp %p ucontext %p", __func__, qhp, ucontext); destroy_qp(&rhp->rdev, &qhp->wq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); if (ucontext) c4iw_put_ucontext(ucontext); kfree(qhp); } static void queue_qp_free(struct kref *kref) { struct c4iw_qp *qhp; qhp = container_of(kref, struct c4iw_qp, kref); CTR2(KTR_IW_CXGBE, "%s qhp %p", __func__, qhp); queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work); } void c4iw_qp_add_ref(struct ib_qp *qp) { CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, qp); kref_get(&to_c4iw_qp(qp)->kref); } void c4iw_qp_rem_ref(struct ib_qp *qp) { CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, qp); kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free); } -static void complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr) +static void complete_sq_drain_wr(struct c4iw_qp *qhp, const struct ib_send_wr *wr) { struct t4_cqe cqe = {}; struct c4iw_cq *schp; unsigned long flag; struct t4_cq *cq; schp = to_c4iw_cq(qhp->ibqp.send_cq); cq = &schp->cq; PDBG("%s drain sq id %u\n", __func__, qhp->wq.sq.qid); cqe.u.drain_cookie = wr->wr_id; cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) | V_CQE_OPCODE(C4IW_DRAIN_OPCODE) | V_CQE_TYPE(1) | V_CQE_SWCQE(1) | V_CQE_QPID(qhp->wq.sq.qid)); spin_lock_irqsave(&schp->lock, flag); cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen)); cq->sw_queue[cq->sw_pidx] = cqe; t4_swcq_produce(cq); spin_unlock_irqrestore(&schp->lock, flag); spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); spin_unlock_irqrestore(&schp->comp_handler_lock, flag); } -static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct 
ib_recv_wr *wr) +static void complete_rq_drain_wr(struct c4iw_qp *qhp, const struct ib_recv_wr *wr) { struct t4_cqe cqe = {}; struct c4iw_cq *rchp; unsigned long flag; struct t4_cq *cq; rchp = to_c4iw_cq(qhp->ibqp.recv_cq); cq = &rchp->cq; PDBG("%s drain rq id %u\n", __func__, qhp->wq.sq.qid); cqe.u.drain_cookie = wr->wr_id; cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) | V_CQE_OPCODE(C4IW_DRAIN_OPCODE) | V_CQE_TYPE(0) | V_CQE_SWCQE(1) | V_CQE_QPID(qhp->wq.sq.qid)); spin_lock_irqsave(&rchp->lock, flag); cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen)); cq->sw_queue[cq->sw_pidx] = cqe; t4_swcq_produce(cq); spin_unlock_irqrestore(&rchp->lock, flag); spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); } static int build_tpte_memreg(struct fw_ri_fr_nsmr_tpte_wr *fr, - struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16) + const struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16) { __be64 *p = (__be64 *)fr->pbl; if (wr->mr->page_size > C4IW_MAX_PAGE_SIZE) return -EINVAL; fr->r2 = cpu_to_be32(0); fr->stag = cpu_to_be32(mhp->ibmr.rkey); fr->tpte.valid_to_pdid = cpu_to_be32(F_FW_RI_TPTE_VALID | V_FW_RI_TPTE_STAGKEY((mhp->ibmr.rkey & M_FW_RI_TPTE_STAGKEY)) | V_FW_RI_TPTE_STAGSTATE(1) | V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) | V_FW_RI_TPTE_PDID(mhp->attr.pdid)); fr->tpte.locread_to_qpid = cpu_to_be32( V_FW_RI_TPTE_PERM(c4iw_ib_to_tpt_access(wr->access)) | V_FW_RI_TPTE_ADDRTYPE(FW_RI_VA_BASED_TO) | V_FW_RI_TPTE_PS(ilog2(wr->mr->page_size) - 12)); fr->tpte.nosnoop_pbladdr = cpu_to_be32(V_FW_RI_TPTE_PBLADDR( PBL_OFF(&mhp->rhp->rdev, mhp->attr.pbl_addr)>>3)); fr->tpte.dca_mwbcnt_pstag = cpu_to_be32(0); fr->tpte.len_hi = cpu_to_be32(mhp->ibmr.length >> 32); fr->tpte.len_lo = cpu_to_be32(mhp->ibmr.length & 0xffffffff); fr->tpte.va_hi = cpu_to_be32(mhp->ibmr.iova >> 32); fr->tpte.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & 0xffffffff); p[0] = 
cpu_to_be64((u64)mhp->mpl[0]); p[1] = cpu_to_be64((u64)mhp->mpl[1]); *len16 = DIV_ROUND_UP(sizeof(*fr), 16); return 0; } static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, - struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16, + const struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16, bool dsgl_supported) { struct fw_ri_immd *imdp; __be64 *p; int i; int pbllen = roundup(mhp->mpl_len * sizeof(u64), 32); int rem; if (mhp->mpl_len > t4_max_fr_depth(&mhp->rhp->rdev, use_dsgl)) return -EINVAL; if (wr->mr->page_size > C4IW_MAX_PAGE_SIZE) return -EINVAL; wqe->fr.qpbinde_to_dcacpu = 0; wqe->fr.pgsz_shift = ilog2(wr->mr->page_size) - 12; wqe->fr.addr_type = FW_RI_VA_BASED_TO; wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->access); wqe->fr.len_hi = cpu_to_be32(mhp->ibmr.length >> 32); wqe->fr.len_lo = cpu_to_be32(mhp->ibmr.length & 0xffffffff); wqe->fr.stag = cpu_to_be32(wr->key); wqe->fr.va_hi = cpu_to_be32(mhp->ibmr.iova >> 32); wqe->fr.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & 0xffffffff); if (dsgl_supported && use_dsgl && (pbllen > max_fr_immd)) { struct fw_ri_dsgl *sglp; for (i = 0; i < mhp->mpl_len; i++) mhp->mpl[i] = (__force u64)cpu_to_be64((u64)mhp->mpl[i]); sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1); sglp->op = FW_RI_DATA_DSGL; sglp->r1 = 0; sglp->nsge = cpu_to_be16(1); sglp->addr0 = cpu_to_be64(mhp->mpl_addr); sglp->len0 = cpu_to_be32(pbllen); *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*sglp), 16); } else { imdp = (struct fw_ri_immd *)(&wqe->fr + 1); imdp->op = FW_RI_DATA_IMMD; imdp->r1 = 0; imdp->r2 = 0; imdp->immdlen = cpu_to_be32(pbllen); p = (__be64 *)(imdp + 1); rem = pbllen; for (i = 0; i < mhp->mpl_len; i++) { *p = cpu_to_be64((u64)mhp->mpl[i]); rem -= sizeof(*p); if (++p == (__be64 *)&sq->queue[sq->size]) p = (__be64 *)sq->queue; } BUG_ON(rem < 0); while (rem) { *p = 0; rem -= sizeof(*p); if (++p == (__be64 *)&sq->queue[sq->size]) p = (__be64 *)sq->queue; } *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*imdp) + pbllen, 16); } return 0; } 
-int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { int err = 0; u8 len16 = 0; enum fw_wr_opcodes fw_opcode = 0; enum fw_ri_wr_flags fw_flags; struct c4iw_qp *qhp; union t4_wr *wqe = NULL; u32 num_wrs; struct t4_swsqe *swsqe; unsigned long flag; u16 idx = 0; struct c4iw_rdev *rdev; qhp = to_c4iw_qp(ibqp); rdev = &qhp->rhp->rdev; spin_lock_irqsave(&qhp->lock, flag); if (t4_wq_in_error(&qhp->wq)) { spin_unlock_irqrestore(&qhp->lock, flag); complete_sq_drain_wr(qhp, wr); return err; } num_wrs = t4_sq_avail(&qhp->wq); if (num_wrs == 0) { spin_unlock_irqrestore(&qhp->lock, flag); *bad_wr = wr; return -ENOMEM; } while (wr) { if (num_wrs == 0) { err = -ENOMEM; *bad_wr = wr; break; } wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue + qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE); fw_flags = 0; if (wr->send_flags & IB_SEND_SOLICITED) fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all) fw_flags |= FW_RI_COMPLETION_FLAG; swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; switch (wr->opcode) { case IB_WR_SEND_WITH_INV: case IB_WR_SEND: if (wr->send_flags & IB_SEND_FENCE) fw_flags |= FW_RI_READ_FENCE_FLAG; fw_opcode = FW_RI_SEND_WR; if (wr->opcode == IB_WR_SEND) swsqe->opcode = FW_RI_SEND; else swsqe->opcode = FW_RI_SEND_WITH_INV; err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); break; case IB_WR_RDMA_WRITE: fw_opcode = FW_RI_RDMA_WRITE_WR; swsqe->opcode = FW_RI_RDMA_WRITE; err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: fw_opcode = FW_RI_RDMA_READ_WR; swsqe->opcode = FW_RI_READ_REQ; if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { c4iw_invalidate_mr(qhp->rhp, wr->sg_list[0].lkey); fw_flags = FW_RI_RDMA_READ_INVALIDATE; } else { fw_flags = 0; } err = build_rdma_read(wqe, wr, &len16); if (err) break; swsqe->read_len = wr->sg_list[0].length; 
if (!qhp->wq.sq.oldest_read) qhp->wq.sq.oldest_read = swsqe; break; case IB_WR_REG_MR: { struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); swsqe->opcode = FW_RI_FAST_REGISTER; if (rdev->adap->params.fr_nsmr_tpte_wr_support && !mhp->attr.state && mhp->mpl_len <= 2) { fw_opcode = FW_RI_FR_NSMR_TPTE_WR; err = build_tpte_memreg(&wqe->fr_tpte, reg_wr(wr), mhp, &len16); } else { fw_opcode = FW_RI_FR_NSMR_WR; err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), mhp, &len16, rdev->adap->params.ulptx_memwrite_dsgl); } if (err) break; mhp->attr.state = 1; break; } case IB_WR_LOCAL_INV: if (wr->send_flags & IB_SEND_FENCE) fw_flags |= FW_RI_LOCAL_FENCE_FLAG; fw_opcode = FW_RI_INV_LSTAG_WR; swsqe->opcode = FW_RI_LOCAL_INV; err = build_inv_stag(wqe, wr, &len16); c4iw_invalidate_mr(qhp->rhp, wr->ex.invalidate_rkey); break; default: CTR2(KTR_IW_CXGBE, "%s post of type =%d TBD!", __func__, wr->opcode); err = -EINVAL; } if (err) { *bad_wr = wr; break; } swsqe->idx = qhp->wq.sq.pidx; swsqe->complete = 0; swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || qhp->sq_sig_all; swsqe->flushed = 0; swsqe->wr_id = wr->wr_id; init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); CTR5(KTR_IW_CXGBE, "%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u", __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx, swsqe->opcode, swsqe->read_len); wr = wr->next; num_wrs--; t4_sq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); } t4_ring_sq_db(&qhp->wq, idx, wqe, rdev->adap->iwt.wc_en); spin_unlock_irqrestore(&qhp->lock, flag); return err; } -int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int c4iw_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int err = 0; struct c4iw_qp *qhp; union t4_recv_wr *wqe = NULL; u32 num_wrs; u8 len16 = 0; unsigned long flag; u16 idx = 0; qhp = to_c4iw_qp(ibqp); spin_lock_irqsave(&qhp->lock, flag); if (t4_wq_in_error(&qhp->wq)) { 
spin_unlock_irqrestore(&qhp->lock, flag); complete_rq_drain_wr(qhp, wr); return err; } num_wrs = t4_rq_avail(&qhp->wq); if (num_wrs == 0) { spin_unlock_irqrestore(&qhp->lock, flag); *bad_wr = wr; return -ENOMEM; } while (wr) { if (wr->num_sge > T4_MAX_RECV_SGE) { err = -EINVAL; *bad_wr = wr; break; } wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue + qhp->wq.rq.wq_pidx * T4_EQ_ENTRY_SIZE); if (num_wrs) err = build_rdma_recv(qhp, wqe, wr, &len16); else err = -ENOMEM; if (err) { *bad_wr = wr; break; } qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id; wqe->recv.opcode = FW_RI_RECV_WR; wqe->recv.r1 = 0; wqe->recv.wrid = qhp->wq.rq.pidx; wqe->recv.r2[0] = 0; wqe->recv.r2[1] = 0; wqe->recv.r2[2] = 0; wqe->recv.len16 = len16; CTR3(KTR_IW_CXGBE, "%s cookie 0x%llx pidx %u", __func__, (unsigned long long) wr->wr_id, qhp->wq.rq.pidx); t4_rq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); wr = wr->next; num_wrs--; } t4_ring_rq_db(&qhp->wq, idx, wqe, qhp->rhp->rdev.adap->iwt.wc_en); spin_unlock_irqrestore(&qhp->lock, flag); return err; } static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, u8 *ecode) { int status; int tagged; int opcode; int rqtype; int send_inv; if (!err_cqe) { *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; *ecode = 0; return; } status = CQE_STATUS(err_cqe); opcode = CQE_OPCODE(err_cqe); rqtype = RQ_TYPE(err_cqe); send_inv = (opcode == FW_RI_SEND_WITH_INV) || (opcode == FW_RI_SEND_WITH_SE_INV); tagged = (opcode == FW_RI_RDMA_WRITE) || (rqtype && (opcode == FW_RI_READ_RESP)); switch (status) { case T4_ERR_STAG: if (send_inv) { *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_CANT_INV_STAG; } else { *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_INV_STAG; } break; case T4_ERR_PDID: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; if ((opcode == FW_RI_SEND_WITH_INV) || (opcode == FW_RI_SEND_WITH_SE_INV)) *ecode = RDMAP_CANT_INV_STAG; else *ecode = RDMAP_STAG_NOT_ASSOC; break; case T4_ERR_QPID: 
*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_STAG_NOT_ASSOC; break; case T4_ERR_ACCESS: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_ACC_VIOL; break; case T4_ERR_WRAP: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_TO_WRAP; break; case T4_ERR_BOUND: if (tagged) { *layer_type = LAYER_DDP|DDP_TAGGED_ERR; *ecode = DDPT_BASE_BOUNDS; } else { *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_BASE_BOUNDS; } break; case T4_ERR_INVALIDATE_SHARED_MR: case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_CANT_INV_STAG; break; case T4_ERR_ECC: case T4_ERR_ECC_PSTAG: case T4_ERR_INTERNAL_ERR: *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; *ecode = 0; break; case T4_ERR_OUT_OF_RQE: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_MSN_NOBUF; break; case T4_ERR_PBL_ADDR_BOUND: *layer_type = LAYER_DDP|DDP_TAGGED_ERR; *ecode = DDPT_BASE_BOUNDS; break; case T4_ERR_CRC: *layer_type = LAYER_MPA|DDP_LLP; *ecode = MPA_CRC_ERR; break; case T4_ERR_MARKER: *layer_type = LAYER_MPA|DDP_LLP; *ecode = MPA_MARKER_ERR; break; case T4_ERR_PDU_LEN_ERR: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_MSG_TOOBIG; break; case T4_ERR_DDP_VERSION: if (tagged) { *layer_type = LAYER_DDP|DDP_TAGGED_ERR; *ecode = DDPT_INV_VERS; } else { *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_VERS; } break; case T4_ERR_RDMA_VERSION: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_INV_VERS; break; case T4_ERR_OPCODE: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_INV_OPCODE; break; case T4_ERR_DDP_QUEUE_NUM: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_QN; break; case T4_ERR_MSN: case T4_ERR_MSN_GAP: case T4_ERR_MSN_RANGE: case T4_ERR_IRD_OVERFLOW: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_MSN_RANGE; break; case T4_ERR_TBIT: *layer_type = LAYER_DDP|DDP_LOCAL_CATA; *ecode = 0; break; case T4_ERR_MO: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; 
*ecode = DDPU_INV_MO; break; default: *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; *ecode = 0; break; } } static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, gfp_t gfp) { int ret; struct fw_ri_wr *wqe; struct terminate_message *term; struct wrqe *wr; struct socket *so = qhp->ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; CTR4(KTR_IW_CXGBE, "%s qhp %p qid 0x%x tid %u", __func__, qhp, qhp->wq.sq.qid, qhp->ep->hwtid); wr = alloc_wrqe(sizeof(*wqe), &toep->ofld_txq->wrq); if (wr == NULL) return; wqe = wrtod(wr); memset(wqe, 0, sizeof *wqe); wqe->op_compl = cpu_to_be32(V_FW_WR_OP(FW_RI_WR)); wqe->flowid_len16 = cpu_to_be32( V_FW_WR_FLOWID(qhp->ep->hwtid) | V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); wqe->u.terminate.type = FW_RI_TYPE_TERMINATE; wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term); term = (struct terminate_message *)wqe->u.terminate.termmsg; if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) { term->layer_etype = qhp->attr.layer_etype; term->ecode = qhp->attr.ecode; } else build_term_codes(err_cqe, &term->layer_etype, &term->ecode); ret = creds(toep, inp, sizeof(*wqe)); if (ret) { free_wrqe(wr); return; } t4_wrq_tx(qhp->rhp->rdev.adap, wr); } /* Assumes qhp lock is held. */ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, struct c4iw_cq *schp) { int count; int rq_flushed, sq_flushed; unsigned long flag; CTR4(KTR_IW_CXGBE, "%s qhp %p rchp %p schp %p", __func__, qhp, rchp, schp); /* locking hierarchy: cq lock first, then qp lock. */ spin_lock_irqsave(&rchp->lock, flag); spin_lock(&qhp->lock); if (qhp->wq.flushed) { spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, flag); return; } qhp->wq.flushed = 1; c4iw_flush_hw_cq(rchp); c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, flag); /* locking hierarchy: cq lock first, then qp lock. 
*/ spin_lock_irqsave(&schp->lock, flag); spin_lock(&qhp->lock); if (schp != rchp) c4iw_flush_hw_cq(schp); sq_flushed = c4iw_flush_sq(qhp); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, flag); if (schp == rchp) { if (t4_clear_cq_armed(&rchp->cq) && (rq_flushed || sq_flushed)) { spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); } } else { if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) { spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); } if (t4_clear_cq_armed(&schp->cq) && sq_flushed) { spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); spin_unlock_irqrestore(&schp->comp_handler_lock, flag); } } } static void flush_qp(struct c4iw_qp *qhp) { struct c4iw_cq *rchp, *schp; unsigned long flag; rchp = to_c4iw_cq(qhp->ibqp.recv_cq); schp = to_c4iw_cq(qhp->ibqp.send_cq); t4_set_wq_in_error(&qhp->wq); if (qhp->ibqp.uobject) { t4_set_cq_in_error(&rchp->cq); spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); if (schp != rchp) { t4_set_cq_in_error(&schp->cq); spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); spin_unlock_irqrestore(&schp->comp_handler_lock, flag); } return; } __flush_qp(qhp, rchp, schp); } static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, struct c4iw_ep *ep) { struct c4iw_rdev *rdev = &rhp->rdev; struct adapter *sc = rdev->adap; struct fw_ri_wr *wqe; int ret; struct wrqe *wr; struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; KASSERT(rhp == qhp->rhp && ep == qhp->ep, 
("%s: EDOOFUS", __func__)); CTR5(KTR_IW_CXGBE, "%s qhp %p qid 0x%x ep %p tid %u", __func__, qhp, qhp->wq.sq.qid, ep, ep->hwtid); wr = alloc_wrqe(sizeof(*wqe), &toep->ofld_txq->wrq); if (wr == NULL) return (0); wqe = wrtod(wr); memset(wqe, 0, sizeof *wqe); wqe->op_compl = cpu_to_be32(V_FW_WR_OP(FW_RI_WR) | F_FW_WR_COMPL); wqe->flowid_len16 = cpu_to_be32(V_FW_WR_FLOWID(ep->hwtid) | V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); wqe->cookie = (unsigned long) &ep->com.wr_wait; wqe->u.fini.type = FW_RI_TYPE_FINI; c4iw_init_wr_wait(&ep->com.wr_wait); ret = creds(toep, inp, sizeof(*wqe)); if (ret) { free_wrqe(wr); return ret; } t4_wrq_tx(sc, wr); ret = c4iw_wait_for_reply(rdev, &ep->com.wr_wait, ep->hwtid, qhp->wq.sq.qid, ep->com.so, __func__); return ret; } static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) { CTR2(KTR_IW_CXGBE, "%s p2p_type = %d", __func__, p2p_type); memset(&init->u, 0, sizeof init->u); switch (p2p_type) { case FW_RI_INIT_P2PTYPE_RDMA_WRITE: init->u.write.opcode = FW_RI_RDMA_WRITE_WR; init->u.write.stag_sink = cpu_to_be32(1); init->u.write.to_sink = cpu_to_be64(1); init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD; init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write + sizeof(struct fw_ri_immd), 16); break; case FW_RI_INIT_P2PTYPE_READ_REQ: init->u.write.opcode = FW_RI_RDMA_READ_WR; init->u.read.stag_src = cpu_to_be32(1); init->u.read.to_src_lo = cpu_to_be32(1); init->u.read.stag_sink = cpu_to_be32(1); init->u.read.to_sink_lo = cpu_to_be32(1); init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16); break; } } static int creds(struct toepcb *toep, struct inpcb *inp, size_t wrsize) { struct ofld_tx_sdesc *txsd; CTR3(KTR_IW_CXGBE, "%s:creB %p %u", __func__, toep , wrsize); INP_WLOCK(inp); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) { INP_WUNLOCK(inp); return (EINVAL); } txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = howmany(wrsize, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && 
toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; INP_WUNLOCK(inp); CTR5(KTR_IW_CXGBE, "%s:creE %p %u %u %u", __func__, toep , txsd->tx_credits, toep->tx_credits, toep->txsd_pidx); return (0); } static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) { struct fw_ri_wr *wqe; int ret; struct wrqe *wr; struct c4iw_ep *ep = qhp->ep; struct c4iw_rdev *rdev = &qhp->rhp->rdev; struct adapter *sc = rdev->adap; struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; CTR5(KTR_IW_CXGBE, "%s qhp %p qid 0x%x ep %p tid %u", __func__, qhp, qhp->wq.sq.qid, ep, ep->hwtid); wr = alloc_wrqe(sizeof(*wqe), &toep->ofld_txq->wrq); if (wr == NULL) return (0); wqe = wrtod(wr); ret = alloc_ird(rhp, qhp->attr.max_ird); if (ret) { qhp->attr.max_ird = 0; free_wrqe(wr); return ret; } memset(wqe, 0, sizeof *wqe); wqe->op_compl = cpu_to_be32( V_FW_WR_OP(FW_RI_WR) | F_FW_WR_COMPL); wqe->flowid_len16 = cpu_to_be32(V_FW_WR_FLOWID(ep->hwtid) | V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); wqe->cookie = (unsigned long) &ep->com.wr_wait; wqe->u.init.type = FW_RI_TYPE_INIT; wqe->u.init.mpareqbit_p2ptype = V_FW_RI_WR_MPAREQBIT(qhp->attr.mpa_attr.initiator) | V_FW_RI_WR_P2PTYPE(qhp->attr.mpa_attr.p2p_type); wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE; if (qhp->attr.mpa_attr.recv_marker_enabled) wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE; if (qhp->attr.mpa_attr.xmit_marker_enabled) wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE; if (qhp->attr.mpa_attr.crc_enabled) wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE; wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE | FW_RI_QP_RDMA_WRITE_ENABLE | FW_RI_QP_BIND_ENABLE; if (!qhp->ibqp.uobject) wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE | FW_RI_QP_STAG0_ENABLE; wqe->u.init.nrqe = 
cpu_to_be16(t4_rqes_posted(&qhp->wq)); wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd); wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid); wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid); wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid); wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq); wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq); wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord); wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird); wqe->u.init.iss = cpu_to_be32(ep->snd_seq); wqe->u.init.irs = cpu_to_be32(ep->rcv_seq); wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size); wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr - sc->vres.rq.start); if (qhp->attr.mpa_attr.initiator) build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); c4iw_init_wr_wait(&ep->com.wr_wait); ret = creds(toep, inp, sizeof(*wqe)); if (ret) { free_wrqe(wr); free_ird(rhp, qhp->attr.max_ird); return ret; } t4_wrq_tx(sc, wr); ret = c4iw_wait_for_reply(rdev, &ep->com.wr_wait, ep->hwtid, qhp->wq.sq.qid, ep->com.so, __func__); toep->params.ulp_mode = ULP_MODE_RDMA; free_ird(rhp, qhp->attr.max_ird); return ret; } int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, enum c4iw_qp_attr_mask mask, struct c4iw_qp_attributes *attrs, int internal) { int ret = 0; struct c4iw_qp_attributes newattr = qhp->attr; int disconnect = 0; int terminate = 0; int abort = 0; int free = 0; struct c4iw_ep *ep = NULL; CTR5(KTR_IW_CXGBE, "%s qhp %p sqid 0x%x rqid 0x%x ep %p", __func__, qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep); CTR3(KTR_IW_CXGBE, "%s state %d -> %d", __func__, qhp->attr.state, (mask & C4IW_QP_ATTR_NEXT_STATE) ? 
attrs->next_state : -1); mutex_lock(&qhp->mutex); /* Process attr changes if in IDLE */ if (mask & C4IW_QP_ATTR_VALID_MODIFY) { if (qhp->attr.state != C4IW_QP_STATE_IDLE) { ret = -EIO; goto out; } if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ) newattr.enable_rdma_read = attrs->enable_rdma_read; if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE) newattr.enable_rdma_write = attrs->enable_rdma_write; if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND) newattr.enable_bind = attrs->enable_bind; if (mask & C4IW_QP_ATTR_MAX_ORD) { if (attrs->max_ord > c4iw_max_read_depth) { ret = -EINVAL; goto out; } newattr.max_ord = attrs->max_ord; } if (mask & C4IW_QP_ATTR_MAX_IRD) { if (attrs->max_ird > cur_max_read_depth(rhp)) { ret = -EINVAL; goto out; } newattr.max_ird = attrs->max_ird; } qhp->attr = newattr; } if (!(mask & C4IW_QP_ATTR_NEXT_STATE)) goto out; if (qhp->attr.state == attrs->next_state) goto out; /* Return EINPROGRESS if QP is already in transition state. * Eg: CLOSING->IDLE transition or *->ERROR transition. * This can happen while connection is switching(due to rdma_fini) * from iWARP/RDDP to TOE mode and any inflight RDMA RX data will * reach TOE driver -> TCP stack -> iWARP driver. In this way * iWARP driver keep receiving inflight RDMA RX data until socket * is closed or aborted. And if iWARP CM is in FPDU sate, then * it tries to put QP in TERM state and disconnects endpoint. * But as QP is already in transition state, this event is ignored. 
*/ if ((qhp->attr.state >= C4IW_QP_STATE_ERROR) && (attrs->next_state == C4IW_QP_STATE_TERMINATE)) { ret = -EINPROGRESS; goto out; } switch (qhp->attr.state) { case C4IW_QP_STATE_IDLE: switch (attrs->next_state) { case C4IW_QP_STATE_RTS: if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) { ret = -EINVAL; goto out; } if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) { ret = -EINVAL; goto out; } qhp->attr.mpa_attr = attrs->mpa_attr; qhp->attr.llp_stream_handle = attrs->llp_stream_handle; qhp->ep = qhp->attr.llp_stream_handle; set_state(qhp, C4IW_QP_STATE_RTS); /* * Ref the endpoint here and deref when we * disassociate the endpoint from the QP. This * happens in CLOSING->IDLE transition or *->ERROR * transition. */ c4iw_get_ep(&qhp->ep->com); ret = rdma_init(rhp, qhp); if (ret) goto err; break; case C4IW_QP_STATE_ERROR: set_state(qhp, C4IW_QP_STATE_ERROR); flush_qp(qhp); break; default: ret = -EINVAL; goto out; } break; case C4IW_QP_STATE_RTS: switch (attrs->next_state) { case C4IW_QP_STATE_CLOSING: BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_CLOSING); ep = qhp->ep; if (!internal) { abort = 0; disconnect = 1; c4iw_get_ep(&qhp->ep->com); } ret = rdma_fini(rhp, qhp, ep); if (ret) goto err; break; case C4IW_QP_STATE_TERMINATE: t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_TERMINATE); qhp->attr.layer_etype = attrs->layer_etype; qhp->attr.ecode = attrs->ecode; ep = qhp->ep; if (!internal) { c4iw_get_ep(&qhp->ep->com); terminate = 1; disconnect = 1; } else { terminate = qhp->attr.send_term; ret = rdma_fini(rhp, qhp, ep); if (ret) goto err; } break; case C4IW_QP_STATE_ERROR: t4_set_wq_in_error(&qhp->wq); set_state(qhp, C4IW_QP_STATE_ERROR); if (!internal) { abort = 1; disconnect = 1; ep = qhp->ep; c4iw_get_ep(&qhp->ep->com); } goto err; break; default: ret = -EINVAL; goto out; } break; case C4IW_QP_STATE_CLOSING: /* * Allow kernel users to move to ERROR for qp draining. 
*/ if (!internal && (qhp->ibqp.uobject || attrs->next_state != C4IW_QP_STATE_ERROR)) { ret = -EINVAL; goto out; } switch (attrs->next_state) { case C4IW_QP_STATE_IDLE: flush_qp(qhp); set_state(qhp, C4IW_QP_STATE_IDLE); qhp->attr.llp_stream_handle = NULL; c4iw_put_ep(&qhp->ep->com); qhp->ep = NULL; wake_up(&qhp->wait); break; case C4IW_QP_STATE_ERROR: goto err; default: ret = -EINVAL; goto err; } break; case C4IW_QP_STATE_ERROR: if (attrs->next_state != C4IW_QP_STATE_IDLE) { ret = -EINVAL; goto out; } if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) { ret = -EINVAL; goto out; } set_state(qhp, C4IW_QP_STATE_IDLE); break; case C4IW_QP_STATE_TERMINATE: if (!internal) { ret = -EINVAL; goto out; } goto err; break; default: printf("%s in a bad state %d\n", __func__, qhp->attr.state); ret = -EINVAL; goto err; break; } goto out; err: CTR3(KTR_IW_CXGBE, "%s disassociating ep %p qpid 0x%x", __func__, qhp->ep, qhp->wq.sq.qid); /* disassociate the LLP connection */ qhp->attr.llp_stream_handle = NULL; if (!ep) ep = qhp->ep; qhp->ep = NULL; set_state(qhp, C4IW_QP_STATE_ERROR); free = 1; abort = 1; BUG_ON(!ep); flush_qp(qhp); wake_up(&qhp->wait); out: mutex_unlock(&qhp->mutex); if (terminate) post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL); /* * If disconnect is 1, then we need to initiate a disconnect * on the EP. This can be a normal close (RTS->CLOSING) or * an abnormal close (RTS/CLOSING->ERROR). */ if (disconnect) { __c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC : GFP_KERNEL); c4iw_put_ep(&ep->com); } /* * If free is 1, then we've disassociated the EP from the QP * and we need to dereference the EP. 
*/ if (free) c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s exit state %d", __func__, qhp->attr.state); return ret; } int c4iw_destroy_qp(struct ib_qp *ib_qp) { struct c4iw_dev *rhp; struct c4iw_qp *qhp; struct c4iw_qp_attributes attrs; CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, ib_qp); qhp = to_c4iw_qp(ib_qp); rhp = qhp->rhp; attrs.next_state = C4IW_QP_STATE_ERROR; if (qhp->attr.state == C4IW_QP_STATE_TERMINATE) c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); else c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); wait_event(qhp->wait, !qhp->ep); remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); free_ird(rhp, qhp->attr.max_ird); c4iw_qp_rem_ref(ib_qp); CTR3(KTR_IW_CXGBE, "%s ib_qp %p qpid 0x%0x", __func__, ib_qp, qhp->wq.sq.qid); return 0; } struct ib_qp * c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct ib_udata *udata) { struct c4iw_dev *rhp; struct c4iw_qp *qhp; struct c4iw_pd *php; struct c4iw_cq *schp; struct c4iw_cq *rchp; struct c4iw_create_qp_resp uresp; unsigned int sqsize, rqsize; struct c4iw_ucontext *ucontext; int ret; struct c4iw_mm_entry *sq_key_mm = NULL, *rq_key_mm = NULL; struct c4iw_mm_entry *sq_db_key_mm = NULL, *rq_db_key_mm = NULL; CTR2(KTR_IW_CXGBE, "%s ib_pd %p", __func__, pd); if (attrs->qp_type != IB_QPT_RC) return ERR_PTR(-EINVAL); php = to_c4iw_pd(pd); rhp = php->rhp; schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid); rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid); if (!schp || !rchp) return ERR_PTR(-EINVAL); if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE) return ERR_PTR(-EINVAL); if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size) return ERR_PTR(-E2BIG); rqsize = attrs->cap.max_recv_wr + 1; if (rqsize < 8) rqsize = 8; if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size) return ERR_PTR(-E2BIG); sqsize = attrs->cap.max_send_wr + 1; if (sqsize < 8) sqsize = 8; ucontext = pd->uobject ? 
to_c4iw_ucontext(pd->uobject->context) : NULL; qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); if (!qhp) return ERR_PTR(-ENOMEM); qhp->wq.sq.size = sqsize; qhp->wq.sq.memsize = (sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64); qhp->wq.sq.flush_cidx = -1; qhp->wq.rq.size = rqsize; qhp->wq.rq.memsize = (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * sizeof(*qhp->wq.rq.queue); if (ucontext) { qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE); qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE); } CTR5(KTR_IW_CXGBE, "%s sqsize %u sqmemsize %zu rqsize %u rqmemsize %zu", __func__, sqsize, qhp->wq.sq.memsize, rqsize, qhp->wq.rq.memsize); ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); if (ret) goto err1; attrs->cap.max_recv_wr = rqsize - 1; attrs->cap.max_send_wr = sqsize - 1; attrs->cap.max_inline_data = T4_MAX_SEND_INLINE; qhp->rhp = rhp; qhp->attr.pd = php->pdid; qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid; qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid; qhp->attr.sq_num_entries = attrs->cap.max_send_wr; qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; qhp->attr.sq_max_sges = attrs->cap.max_send_sge; qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; qhp->attr.state = C4IW_QP_STATE_IDLE; qhp->attr.next_state = C4IW_QP_STATE_IDLE; qhp->attr.enable_rdma_read = 1; qhp->attr.enable_rdma_write = 1; qhp->attr.enable_bind = 1; qhp->attr.max_ord = 0; qhp->attr.max_ird = 0; qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; spin_lock_init(&qhp->lock); mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); kref_init(&qhp->kref); INIT_WORK(&qhp->free_work, free_qp_work); ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); if (ret) goto err2; if (udata) { sq_key_mm = kmalloc(sizeof(*sq_key_mm), GFP_KERNEL); if (!sq_key_mm) { ret = -ENOMEM; goto 
err3; } rq_key_mm = kmalloc(sizeof(*rq_key_mm), GFP_KERNEL); if (!rq_key_mm) { ret = -ENOMEM; goto err4; } sq_db_key_mm = kmalloc(sizeof(*sq_db_key_mm), GFP_KERNEL); if (!sq_db_key_mm) { ret = -ENOMEM; goto err5; } rq_db_key_mm = kmalloc(sizeof(*rq_db_key_mm), GFP_KERNEL); if (!rq_db_key_mm) { ret = -ENOMEM; goto err6; } uresp.flags = 0; uresp.qid_mask = rhp->rdev.qpmask; uresp.sqid = qhp->wq.sq.qid; uresp.sq_size = qhp->wq.sq.size; uresp.sq_memsize = qhp->wq.sq.memsize; uresp.rqid = qhp->wq.rq.qid; uresp.rq_size = qhp->wq.rq.size; uresp.rq_memsize = qhp->wq.rq.memsize; spin_lock(&ucontext->mmap_lock); uresp.ma_sync_key = 0; uresp.sq_key = ucontext->key; ucontext->key += PAGE_SIZE; uresp.rq_key = ucontext->key; ucontext->key += PAGE_SIZE; uresp.sq_db_gts_key = ucontext->key; ucontext->key += PAGE_SIZE; uresp.rq_db_gts_key = ucontext->key; ucontext->key += PAGE_SIZE; spin_unlock(&ucontext->mmap_lock); ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); if (ret) goto err7; sq_key_mm->key = uresp.sq_key; sq_key_mm->addr = qhp->wq.sq.phys_addr; sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); CTR4(KTR_IW_CXGBE, "%s sq_key_mm %x, %x, %d", __func__, sq_key_mm->key, sq_key_mm->addr, sq_key_mm->len); insert_mmap(ucontext, sq_key_mm); rq_key_mm->key = uresp.rq_key; rq_key_mm->addr = qhp->wq.rq.phys_addr; rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); CTR4(KTR_IW_CXGBE, "%s rq_key_mm %x, %x, %d", __func__, rq_key_mm->key, rq_key_mm->addr, rq_key_mm->len); insert_mmap(ucontext, rq_key_mm); sq_db_key_mm->key = uresp.sq_db_gts_key; sq_db_key_mm->addr = (u64)qhp->wq.sq.bar2_pa; sq_db_key_mm->len = PAGE_SIZE; CTR4(KTR_IW_CXGBE, "%s sq_db_key_mm %x, %x, %d", __func__, sq_db_key_mm->key, sq_db_key_mm->addr, sq_db_key_mm->len); insert_mmap(ucontext, sq_db_key_mm); rq_db_key_mm->key = uresp.rq_db_gts_key; rq_db_key_mm->addr = (u64)qhp->wq.rq.bar2_pa; rq_db_key_mm->len = PAGE_SIZE; CTR4(KTR_IW_CXGBE, "%s rq_db_key_mm %x, %x, %d", __func__, rq_db_key_mm->key, rq_db_key_mm->addr, 
rq_db_key_mm->len); insert_mmap(ucontext, rq_db_key_mm); c4iw_get_ucontext(ucontext); qhp->ucontext = ucontext; } qhp->ibqp.qp_num = qhp->wq.sq.qid; init_timer(&(qhp->timer)); CTR5(KTR_IW_CXGBE, "%s sq id %u size %u memsize %zu num_entries %u", __func__, qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize, attrs->cap.max_send_wr); CTR5(KTR_IW_CXGBE, "%s rq id %u size %u memsize %zu num_entries %u", __func__, qhp->wq.rq.qid, qhp->wq.rq.size, qhp->wq.rq.memsize, attrs->cap.max_recv_wr); return &qhp->ibqp; err7: kfree(rq_db_key_mm); err6: kfree(sq_db_key_mm); err5: kfree(rq_key_mm); err4: kfree(sq_key_mm); err3: remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); err2: destroy_qp(&rhp->rdev, &qhp->wq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); err1: kfree(qhp); return ERR_PTR(ret); } int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct c4iw_dev *rhp; struct c4iw_qp *qhp; enum c4iw_qp_attr_mask mask = 0; struct c4iw_qp_attributes attrs; CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, ibqp); /* iwarp does not support the RTR state */ if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) attr_mask &= ~IB_QP_STATE; /* Make sure we still have something left to do */ if (!attr_mask) return 0; memset(&attrs, 0, sizeof attrs); qhp = to_c4iw_qp(ibqp); rhp = qhp->rhp; attrs.next_state = c4iw_convert_state(attr->qp_state); attrs.enable_rdma_read = (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) ? 1 : 0; attrs.enable_rdma_write = (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0; mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? 
(C4IW_QP_ATTR_ENABLE_RDMA_READ | C4IW_QP_ATTR_ENABLE_RDMA_WRITE | C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0; return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0); } struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn) { CTR3(KTR_IW_CXGBE, "%s ib_dev %p qpn 0x%x", __func__, dev, qpn); return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn); } int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr) { struct c4iw_qp *qhp = to_c4iw_qp(ibqp); memset(attr, 0, sizeof *attr); memset(init_attr, 0, sizeof *init_attr); attr->qp_state = to_ib_qp_state(qhp->attr.state); init_attr->cap.max_send_wr = qhp->attr.sq_num_entries; init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries; init_attr->cap.max_send_sge = qhp->attr.sq_max_sges; init_attr->cap.max_recv_sge = qhp->attr.sq_max_sges; init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE; init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0; return 0; } #endif diff --git a/sys/dev/iser/iser_memory.c b/sys/dev/iser/iser_memory.c index ca3b557866b1..9cf48248741c 100644 --- a/sys/dev/iser/iser_memory.c +++ b/sys/dev/iser/iser_memory.c @@ -1,285 +1,286 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "icl_iser.h" static struct fast_reg_descriptor * iser_reg_desc_get(struct ib_conn *ib_conn) { struct fast_reg_descriptor *desc; mtx_lock(&ib_conn->lock); desc = list_first_entry(&ib_conn->fastreg.pool, struct fast_reg_descriptor, list); list_del(&desc->list); mtx_unlock(&ib_conn->lock); return (desc); } static void iser_reg_desc_put(struct ib_conn *ib_conn, struct fast_reg_descriptor *desc) { mtx_lock(&ib_conn->lock); list_add(&desc->list, &ib_conn->fastreg.pool); mtx_unlock(&ib_conn->lock); } #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) /** * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned * for RDMA sub-list of a scatter-gather list of memory buffers, and returns * the number of entries which are aligned correctly. Supports the case where * consecutive SG elements are actually fragments of the same physcial page. 
*/
static int
iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev)
{
	struct scatterlist *sg, *sgl, *next_sg = NULL;
	u64 start_addr, end_addr;
	int i, ret_len, start_check = 0;

	/* A single DMA entry is reported as aligned without inspection. */
	if (data->dma_nents == 1)
		return (1);

	sgl = data->sgl;
	start_addr  = ib_sg_dma_address(ibdev, sgl);

	for_each_sg(sgl, sg, data->dma_nents, i) {
		/*
		 * start_check is set when the previous element did not end
		 * exactly where this one starts, so this start address must
		 * itself pass the 4K check.
		 */
		if (start_check && !IS_4K_ALIGNED(start_addr))
			break;

		next_sg = sg_next(sg);
		if (!next_sg)
			break;

		end_addr    = start_addr + ib_sg_dma_len(ibdev, sg);
		start_addr  = ib_sg_dma_address(ibdev, next_sg);

		/* Contiguous with the next element: treat as one buffer. */
		if (end_addr == start_addr) {
			start_check = 0;
			continue;
		} else
			start_check = 1;

		if (!IS_4K_ALIGNED(end_addr))
			break;
	}
	/*
	 * next_sg is NULL only when the walk reached the last element, in
	 * which case element i itself is counted as well.
	 */
	ret_len = (next_sg) ? i : i+1;

	return (ret_len);
}

/*
 * Undo the DMA mapping of a task's scatter-gather list on the IB device
 * that belongs to the PDU's connection.
 */
void
iser_dma_unmap_task_data(struct icl_iser_pdu *iser_pdu,
    struct iser_data_buf *data,
    enum dma_data_direction dir)
{
	struct ib_device *dev;

	dev = iser_pdu->iser_conn->ib_conn.device->ib_device;
	ib_dma_unmap_sg(dev, data->sgl, data->size, dir);
}

/*
 * Describe the buffer with the device-wide MR: lkey/rkey come from
 * device->mr, address and length from the first scatter-gather entry.
 * Always returns 0.
 */
static int
iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
    struct iser_mem_reg *reg)
{
	struct scatterlist *sg = mem->sgl;

	reg->sge.lkey = device->mr->lkey;
	reg->rkey = device->mr->rkey;
	reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
	reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);

	return (0);
}

/**
 * TODO: This should be a verb
 * iser_ib_inc_rkey - increments the key portion of the given rkey. Can be used
 * for calculating a new rkey for type 2 memory windows.
 * @rkey - the rkey to increment.
*/ static inline u32 iser_ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return (((rkey + 1) & mask) | (rkey & ~mask)); } static void iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) { u32 rkey; memset(inv_wr, 0, sizeof(*inv_wr)); inv_wr->opcode = IB_WR_LOCAL_INV; inv_wr->wr_id = ISER_FASTREG_LI_WRID; inv_wr->ex.invalidate_rkey = mr->rkey; rkey = iser_ib_inc_rkey(mr->rkey); ib_update_fast_reg_key(mr, rkey); } static int iser_fast_reg_mr(struct icl_iser_pdu *iser_pdu, struct iser_data_buf *mem, struct iser_reg_resources *rsc, struct iser_mem_reg *reg) { struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; struct ib_mr *mr = rsc->mr; struct ib_reg_wr fastreg_wr; struct ib_send_wr inv_wr; - struct ib_send_wr *bad_wr, *wr = NULL; + const struct ib_send_wr *bad_wr; + struct ib_send_wr *wr = NULL; int ret, n; /* if there a single dma entry, dma mr suffices */ if (mem->dma_nents == 1) return iser_reg_dma(device, mem, reg); if (!rsc->mr_valid) { iser_inv_rkey(&inv_wr, mr); wr = &inv_wr; } n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K); if (unlikely(n != mem->size)) { ISER_ERR("failed to map sg (%d/%d)\n", n, mem->size); return n < 0 ? 
n : -EINVAL; } /* Prepare FASTREG WR */ memset(&fastreg_wr, 0, sizeof(fastreg_wr)); fastreg_wr.wr.opcode = IB_WR_REG_MR; fastreg_wr.wr.wr_id = ISER_FASTREG_LI_WRID; fastreg_wr.wr.num_sge = 0; fastreg_wr.mr = mr; fastreg_wr.key = mr->rkey; fastreg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; if (!wr) wr = &fastreg_wr.wr; else wr->next = &fastreg_wr.wr; ret = ib_post_send(ib_conn->qp, wr, &bad_wr); if (ret) { ISER_ERR("fast registration failed, ret:%d", ret); return (ret); } rsc->mr_valid = 0; reg->sge.lkey = mr->lkey; reg->rkey = mr->rkey; reg->sge.addr = mr->iova; reg->sge.length = mr->length; return (ret); } /** * iser_reg_rdma_mem - Registers memory intended for RDMA, * using Fast Registration WR (if possible) obtaining rkey and va * * returns 0 on success, errno code on failure */ int iser_reg_rdma_mem(struct icl_iser_pdu *iser_pdu, enum iser_data_dir cmd_dir) { struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_pdu->data[cmd_dir]; struct iser_mem_reg *mem_reg = &iser_pdu->rdma_reg[cmd_dir]; struct fast_reg_descriptor *desc = NULL; int err, aligned_len; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { ISER_ERR("bounce buffer is not supported"); return 1; } if (mem->dma_nents != 1) { desc = iser_reg_desc_get(ib_conn); mem_reg->mem_h = desc; } err = iser_fast_reg_mr(iser_pdu, mem, desc ? 
&desc->rsc : NULL, mem_reg); if (err) goto err_reg; return (0); err_reg: if (desc) iser_reg_desc_put(ib_conn, desc); return (err); } void iser_unreg_rdma_mem(struct icl_iser_pdu *iser_pdu, enum iser_data_dir cmd_dir) { struct iser_mem_reg *reg = &iser_pdu->rdma_reg[cmd_dir]; if (!reg->mem_h) return; iser_reg_desc_put(&iser_pdu->iser_conn->ib_conn, reg->mem_h); reg->mem_h = NULL; } int iser_dma_map_task_data(struct icl_iser_pdu *iser_pdu, struct iser_data_buf *data, enum iser_data_dir iser_dir, enum dma_data_direction dma_dir) { struct ib_device *dev; iser_pdu->dir[iser_dir] = 1; dev = iser_pdu->iser_conn->ib_conn.device->ib_device; data->dma_nents = ib_dma_map_sg(dev, data->sgl, data->size, dma_dir); if (data->dma_nents == 0) { ISER_ERR("dma_map_sg failed"); return (EINVAL); } return (0); } diff --git a/sys/dev/iser/iser_verbs.c b/sys/dev/iser/iser_verbs.c index c9d42da2657b..c613764bd952 100644 --- a/sys/dev/iser/iser_verbs.c +++ b/sys/dev/iser/iser_verbs.c @@ -1,946 +1,949 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "icl_iser.h"

static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
/* Upper bound on completions handled in one pass of the CQ tasklet. */
static int iser_cq_poll_limit = 512;

/* CQ asynchronous event callback: only logs the event number. */
static void
iser_cq_event_callback(struct ib_event *cause, void *context)
{
	ISER_ERR("got cq event %d", cause->event);
}

/* QP asynchronous event callback: only logs the event number. */
static void
iser_qp_event_callback(struct ib_event *cause, void *context)
{
	ISER_ERR("got qp event %d", cause->event);
}

/*
 * Device-level asynchronous event handler: logs the event number together
 * with the device name and port number.
 */
static void
iser_event_handler(struct ib_event_handler *handler,
    struct ib_event *event)
{
	ISER_ERR("async event %d on device %s port %d",
	    event->event, event->device->name,
	    event->element.port_num);
}

/**
 * is_iser_tx_desc - Indicate if the completion wr_id
 *     is a TX descriptor or not.
 * @iser_conn: iser connection
 * @wr_id: completion WR identifier
 *
 * Since we cannot rely on wc opcode in FLUSH errors
 * we must work around it by checking if the wr_id address
 * falls in the iser connection rx_descs buffer. If so
 * it is an RX descriptor, otherwise it is a TX.
*/ static inline bool is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id) { void *start = iser_conn->rx_descs; u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs); void *end = (void *)((uintptr_t)start + (uintptr_t)len); if (start) { if (wr_id >= start && wr_id < end) return false; } else { return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf); } return true; } /** * iser_handle_comp_error() - Handle error completion * @ib_conn: connection RDMA resources * @wc: work completion * * Notes: Update post_recv_buf_count in case of recv error completion. * For non-FLUSH error completion we should also notify iscsi layer that * connection is failed (in case we passed bind stage). */ static void iser_handle_comp_error(struct ib_conn *ib_conn, struct ib_wc *wc) { void *wr_id = (void *)(uintptr_t)wc->wr_id; struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, ib_conn); if (is_iser_tx_desc(iser_conn, wr_id)) { ISER_DBG("conn %p got send comp error", iser_conn); } else { ISER_DBG("conn %p got recv comp error", iser_conn); ib_conn->post_recv_buf_count--; } if (wc->status != IB_WC_WR_FLUSH_ERR) iser_conn->icl_conn.ic_error(&iser_conn->icl_conn); } /** * iser_handle_wc - handle a single work completion * @wc: work completion * * Soft-IRQ context, work completion can be either * SEND or RECV, and can turn out successful or * with error (or flush error). 
*/ static void iser_handle_wc(struct ib_wc *wc) { struct ib_conn *ib_conn; struct iser_tx_desc *tx_desc; struct iser_rx_desc *rx_desc; ib_conn = wc->qp->qp_context; if (likely(wc->status == IB_WC_SUCCESS)) { if (wc->opcode == IB_WC_RECV) { rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id; iser_rcv_completion(rx_desc, wc->byte_len, ib_conn); } else if (wc->opcode == IB_WC_SEND) { tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id; iser_snd_completion(tx_desc, ib_conn); } else { ISER_ERR("Unknown wc opcode %d", wc->opcode); } } else { struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, ib_conn); if (wc->status != IB_WC_WR_FLUSH_ERR) { ISER_ERR("conn %p wr id %llx status %d vend_err %x", iser_conn, (unsigned long long)wc->wr_id, wc->status, wc->vendor_err); } else { ISER_DBG("flush error: conn %p wr id %llx", iser_conn, (unsigned long long)wc->wr_id); } if (wc->wr_id == ISER_BEACON_WRID) { /* all flush errors were consumed */ mtx_lock(&ib_conn->beacon.flush_lock); ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn); cv_signal(&ib_conn->beacon.flush_cv); mtx_unlock(&ib_conn->beacon.flush_lock); } else { iser_handle_comp_error(ib_conn, wc); } } }
/*
 * iser_cq_tasklet_fn - task body that drains one completion queue
 * @data:    the struct iser_comp that owns the CQ
 * @pending: unused
 *
 * Polls the CQ in batches of ARRAY_SIZE(comp->wcs) and passes every
 * completion to iser_handle_wc().  Polling stops when the CQ returns
 * nothing more or once at least iser_cq_poll_limit completions have been
 * handled in this run; the CQ is then re-armed for the next completion.
 */
static void iser_cq_tasklet_fn(void *data, int pending) { struct iser_comp *comp = (struct iser_comp *)data; struct ib_cq *cq = comp->cq; struct ib_wc *const wcs = comp->wcs; int completed = 0; int i; int n; while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) { for (i = 0; i < n; i++) iser_handle_wc(&wcs[i]); completed += n; if (completed >= iser_cq_poll_limit) break; } /* * It is assumed here that arming CQ only once its empty * would not cause interrupts to be missed. 
*/ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); }
/*
 * CQ completion callback: does no work itself, it only enqueues comp->task
 * (iser_cq_tasklet_fn) on the per-completion-context taskqueue.
 */
static void iser_cq_callback(struct ib_cq *cq, void *cq_context) { struct iser_comp *comp = cq_context; taskqueue_enqueue(comp->tq, &comp->task); } /** * iser_create_device_ib_res - creates Protection Domain (PD), Completion * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with * the adapter. * * One CQ and one single-threaded taskqueue are created per completion * context; the number of contexts is min(mp_ncpus, num_comp_vectors). * On failure everything created so far is released through the * tq_err/cq_err/pd_err/comps_err labels. * * returns 0 on success, 1 on failure */ static int iser_create_device_ib_res(struct iser_device *device) { struct ib_device *ib_dev = device->ib_device; int i, max_cqe; if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { ISER_ERR("device %s doesn't support Fastreg, " "can't register memory", device->ib_device->name); return (1); } device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors); device->comps = malloc(device->comps_used * sizeof(*device->comps), M_ISER_VERBS, M_WAITOK | M_ZERO); if (!device->comps) goto comps_err; max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d", device->comps_used, device->ib_device->name, device->ib_device->num_comp_vectors, max_cqe); device->pd = ib_alloc_pd(device->ib_device, IB_PD_UNSAFE_GLOBAL_RKEY); if (IS_ERR(device->pd)) goto pd_err; for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; struct ib_cq_init_attr cq_attr = { .cqe = max_cqe, .comp_vector = i, }; comp->device = device; comp->cq = ib_create_cq(device->ib_device, iser_cq_callback, iser_cq_event_callback, (void *)comp, &cq_attr); if (IS_ERR(comp->cq)) { comp->cq = NULL; goto cq_err; } if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP)) goto cq_err; TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp); comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT, taskqueue_thread_enqueue, &comp->tq); if (!comp->tq) goto tq_err; taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq"); } device->mr = device->pd->__internal_mr; if (IS_ERR(device->mr)) goto tq_err; 
INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, iser_event_handler); if (ib_register_event_handler(&device->event_handler)) goto tq_err; return (0); tq_err: for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; if (comp->tq) taskqueue_free(comp->tq); } cq_err: for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; if (comp->cq) ib_destroy_cq(comp->cq); } ib_dealloc_pd(device->pd); pd_err: free(device->comps, M_ISER_VERBS); comps_err: ISER_ERR("failed to allocate an IB resource"); return (1); } /** * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, * CQ and PD created with the device associated with the adapter. * * For every completion context the taskqueue is freed before its CQ is * destroyed; afterwards the event handler is unregistered, the PD is * deallocated and the comps array is freed. */ static void iser_free_device_ib_res(struct iser_device *device) { int i; for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; taskqueue_free(comp->tq); ib_destroy_cq(comp->cq); comp->cq = NULL; } (void)ib_unregister_event_handler(&device->event_handler); (void)ib_dealloc_pd(device->pd); free(device->comps, M_ISER_VERBS); device->comps = NULL; device->mr = NULL; device->pd = NULL; }
/*
 * iser_alloc_reg_res - allocate the MR backing one registration resource
 *
 * Allocates an IB_MR_TYPE_MEM_REG MR with ISCSI_ISER_SG_TABLESIZE + 1
 * entries on @pd and marks it valid.  Returns 0 on success, otherwise the
 * negated PTR_ERR() of the failed allocation.  @ib_device is not used.
 */
static int iser_alloc_reg_res(struct ib_device *ib_device, struct ib_pd *pd, struct iser_reg_resources *res) { int ret; res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, ISCSI_ISER_SG_TABLESIZE + 1); if (IS_ERR(res->mr)) { ret = -PTR_ERR(res->mr); ISER_ERR("Failed to allocate fast reg mr err=%d", ret); return (ret); } res->mr_valid = 1; return (0); }
/* Deregister the MR owned by @rsc; the structure itself is not freed here. */
static void iser_free_reg_res(struct iser_reg_resources *rsc) { ib_dereg_mr(rsc->mr); }
/*
 * iser_create_fastreg_desc - allocate one zeroed fast-registration
 * descriptor together with its registration resources.
 *
 * Returns the descriptor, or NULL on failure (the descriptor is freed when
 * iser_alloc_reg_res() fails).
 */
static struct fast_reg_descriptor * iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd) { struct fast_reg_descriptor *desc; int ret; desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO); if (!desc) { ISER_ERR("Failed to allocate a new fastreg descriptor"); return (NULL); } ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc); if (ret) { ISER_ERR("failed to allocate reg_resources"); goto err; } 
return (desc); err: free(desc, M_ISER_VERBS); return (NULL); } /** * iser_create_fastreg_pool - fills the connection's fastreg pool with * cmds_max descriptors * * returns 0 on success, or ENOMEM on failure (descriptors created so far * are released through iser_free_fastreg_pool()) */ int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max) { struct iser_device *device = ib_conn->device; struct fast_reg_descriptor *desc; int i; INIT_LIST_HEAD(&ib_conn->fastreg.pool); ib_conn->fastreg.pool_size = 0; for (i = 0; i < cmds_max; i++) { desc = iser_create_fastreg_desc(device->ib_device, device->pd); if (!desc) { ISER_ERR("Failed to create fastreg descriptor"); goto err; } list_add_tail(&desc->list, &ib_conn->fastreg.pool); ib_conn->fastreg.pool_size++; } return (0); err: iser_free_fastreg_pool(ib_conn); return (ENOMEM); } /** * iser_free_fastreg_pool - releases every descriptor currently linked in * the connection's fastreg pool; warns when fewer descriptors were found * on the list than pool_size records */ void iser_free_fastreg_pool(struct ib_conn *ib_conn) { struct fast_reg_descriptor *desc, *tmp; int i = 0; if (list_empty(&ib_conn->fastreg.pool)) return; ISER_DBG("freeing conn %p fr pool", ib_conn); list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { list_del(&desc->list); iser_free_reg_res(&desc->rsc); free(desc, M_ISER_VERBS); ++i; } if (i < ib_conn->fastreg.pool_size) ISER_WARN("pool still has %d regions registered", ib_conn->fastreg.pool_size - i); } /** * iser_create_ib_conn_res - Queue-Pair (QP) * * Picks the completion context with the fewest active QPs (under * ig.connlist_mutex), then creates an RC QP on it whose send depth is * capped by the device's max_qp_wr. * * returns 0 on success, otherwise the error returned by rdma_create_qp() * (the active_qps count taken above is given back first) */ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) { struct iser_conn *iser_conn; struct iser_device *device; struct ib_device_attr *dev_attr; struct ib_qp_init_attr init_attr; int index, min_index = 0; int ret = -ENOMEM; iser_conn = container_of(ib_conn, struct iser_conn, ib_conn); device = ib_conn->device; dev_attr = &device->dev_attr; mtx_lock(&ig.connlist_mutex); /* select the CQ with the minimal number of usages */ for (index = 0; index < device->comps_used; index++) { if (device->comps[index].active_qps < device->comps[min_index].active_qps) min_index = index; } ib_conn->comp = &device->comps[min_index]; 
ib_conn->comp->active_qps++; mtx_unlock(&ig.connlist_mutex); ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn); memset(&init_attr, 0, sizeof init_attr); init_attr.event_handler = iser_qp_event_callback; init_attr.qp_context = (void *)ib_conn; init_attr.send_cq = ib_conn->comp->cq; init_attr.recv_cq = ib_conn->comp->cq; init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; init_attr.cap.max_send_sge = 2; init_attr.cap.max_recv_sge = 1; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) { init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS); } else { init_attr.cap.max_send_wr = dev_attr->max_qp_wr; iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr); } ISER_DBG("device %s supports max_send_wr %d", device->ib_device->name, dev_attr->max_qp_wr); ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); if (ret) goto out_err; ib_conn->qp = ib_conn->cma_id->qp; ISER_DBG("setting conn %p cma_id %p qp %p", ib_conn, ib_conn->cma_id, ib_conn->cma_id->qp); return (ret); out_err: mtx_lock(&ig.connlist_mutex); ib_conn->comp->active_qps--; mtx_unlock(&ig.connlist_mutex); ISER_ERR("unable to alloc mem or create resource, err %d", ret); return (ret); } /** * based on the resolved device node GUID see if there already allocated * device for this device. If there's no such, create one. 
*/ static struct iser_device * iser_device_find_by_ib_device(struct rdma_cm_id *cma_id) { struct iser_device *device; sx_xlock(&ig.device_list_mutex); list_for_each_entry(device, &ig.device_list, ig_list) /* find if there's a match using the node GUID */ if (device->ib_device->node_guid == cma_id->device->node_guid) goto inc_refcnt; device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO); if (device == NULL) goto out; /* assign this device to the device */ device->ib_device = cma_id->device; /* init the device and link it into ig device list */ if (iser_create_device_ib_res(device)) { free(device, M_ISER_VERBS); device = NULL; goto out; } list_add(&device->ig_list, &ig.device_list); inc_refcnt: device->refcount++; ISER_INFO("device %p refcount %d", device, device->refcount); out: sx_xunlock(&ig.device_list_mutex); return (device); } /* if there's no demand for this device, release it */ static void iser_device_try_release(struct iser_device *device) { sx_xlock(&ig.device_list_mutex); device->refcount--; ISER_INFO("device %p refcount %d", device, device->refcount); if (!device->refcount) { iser_free_device_ib_res(device); list_del(&device->ig_list); free(device, M_ISER_VERBS); device = NULL; } sx_xunlock(&ig.device_list_mutex); } /** * Called with state mutex held **/ static int iser_conn_state_comp_exch(struct iser_conn *iser_conn, enum iser_conn_state comp, enum iser_conn_state exch) { int ret; ret = (iser_conn->state == comp); if (ret) iser_conn->state = exch; return ret; } /** * iser_free_ib_conn_res - release IB related resources * @iser_conn: iser connection struct * @destroy: indicator if we need to try to release the * iser device and memory regoins pool (only iscsi * shutdown and DEVICE_REMOVAL will use this). * * This routine is called with the iser state mutex held * so the cm_id removal is out of here. It is Safe to * be invoked multiple times. 
*/ void iser_free_ib_conn_res(struct iser_conn *iser_conn, bool destroy) { struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_device *device = ib_conn->device; ISER_INFO("freeing conn %p cma_id %p qp %p", iser_conn, ib_conn->cma_id, ib_conn->qp); if (ib_conn->qp != NULL) { mtx_lock(&ig.connlist_mutex); ib_conn->comp->active_qps--; mtx_unlock(&ig.connlist_mutex); rdma_destroy_qp(ib_conn->cma_id); ib_conn->qp = NULL; } if (destroy) { if (iser_conn->login_buf) iser_free_login_buf(iser_conn); if (iser_conn->rx_descs) iser_free_rx_descriptors(iser_conn); if (device != NULL) { iser_device_try_release(device); ib_conn->device = NULL; } } } /** * triggers start of the disconnect procedures and wait for them to be done * Called with state mutex held * * Returns 0 when the connection was not in ISER_CONN_UP (nothing is done), * and 1 in every other case, including when posting a beacon fails. */ int iser_conn_terminate(struct iser_conn *iser_conn) { struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct ib_send_wr *bad_send_wr; - struct ib_recv_wr *bad_recv_wr; + const struct ib_send_wr *bad_send_wr; + const struct ib_recv_wr *bad_recv_wr; int err = 0; /* terminate the iser conn only if the conn state is UP */ if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, ISER_CONN_TERMINATING)) return (0); ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state); if (ib_conn->qp == NULL) { /* HOW can this be??? */ ISER_WARN("qp wasn't created"); return (1); } /* * Todo: This is a temporary workaround. * We serialize the connection closure using global lock in order to * receive all posted beacons completions. * Without Serialization, in case we open many connections (QPs) on * the same CQ, we might miss beacons because of missing interrupts. */ sx_xlock(&ig.close_conns_mutex); /* * In case we didn't already clean up the cma_id (peer initiated * a disconnection), we need to Cause the CMA to change the QP * state to ERROR. 
*/ if (ib_conn->cma_id) { err = rdma_disconnect(ib_conn->cma_id); if (err) ISER_ERR("Failed to disconnect, conn: 0x%p err %d", iser_conn, err); mtx_lock(&ib_conn->beacon.flush_lock); memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr)); ib_conn->beacon.send.wr_id = ISER_BEACON_WRID; ib_conn->beacon.send.opcode = IB_WR_SEND; /* post an indication that all send flush errors were consumed */ err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr); if (err) { ISER_ERR("conn %p failed to post send_beacon", ib_conn); mtx_unlock(&ib_conn->beacon.flush_lock); goto out; } ISER_DBG("before send cv_wait: %p", iser_conn); cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock); ISER_DBG("after send cv_wait: %p", iser_conn); memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr)); ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID; /* post an indication that all recv flush errors were consumed */ err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr); if (err) { ISER_ERR("conn %p failed to post recv_beacon", ib_conn); mtx_unlock(&ib_conn->beacon.flush_lock); goto out; } ISER_DBG("before recv cv_wait: %p", iser_conn); cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock); mtx_unlock(&ib_conn->beacon.flush_lock); ISER_DBG("after recv cv_wait: %p", iser_conn); } out: sx_xunlock(&ig.close_conns_mutex); return (1); } /** * Called with state mutex held * * Moves the connection to ISER_CONN_TERMINATING and signals up_cv. **/ static void iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *iser_conn; iser_conn = cma_id->context; ISER_ERR("conn %p", iser_conn); iser_conn->state = ISER_CONN_TERMINATING; cv_signal(&iser_conn->up_cv); } /** * Called with state mutex held * * Address-resolved step: finds or creates the iser_device for the cm_id, * stores it on the connection and starts route resolution with a timeout * argument of 1000. Any failure is routed to iser_connect_error(). **/ static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *iser_conn; struct ib_conn *ib_conn; int ret; iser_conn = cma_id->context; ib_conn = &iser_conn->ib_conn; device = iser_device_find_by_ib_device(cma_id); if (!device) { ISER_ERR("conn %p device lookup/creation failed", 
iser_conn); iser_connect_error(cma_id); return; } ib_conn->device = device; ret = rdma_resolve_route(cma_id, 1000); if (ret) { ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret); iser_connect_error(cma_id); return; } } /** * Called with state mutex held * * Route-resolved step: creates the QP for the connection and issues * rdma_connect() with the iSER CM header as private data. Any failure is * routed to iser_connect_error(). **/ static void iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; int ret; struct iser_cm_hdr req_hdr; struct iser_conn *iser_conn = cma_id->context; struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_device *device = ib_conn->device; ret = iser_create_ib_conn_res(ib_conn); if (ret) goto failure; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = device->dev_attr.max_qp_rd_atom; conn_param.retry_count = 7; conn_param.rnr_retry_count = 6; /* * Initiator depth should not be set, but in order to compat * with old targets, we keep this value set. */ conn_param.initiator_depth = 1; memset(&req_hdr, 0, sizeof(req_hdr)); req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | ISER_SEND_W_INV_NOT_SUPPORTED); conn_param.private_data = (void *)&req_hdr; conn_param.private_data_len = sizeof(struct iser_cm_hdr); ret = rdma_connect(cma_id, &conn_param); if (ret) { ISER_ERR("conn %p failure connecting: %d", iser_conn, ret); goto failure; } return; failure: iser_connect_error(cma_id); } /** * Called with state mutex held * * Established step: logs the local and remote QP numbers, marks the * connection ISER_CONN_UP and signals up_cv. The ib_query_qp() result is * deliberately ignored. **/ static void iser_connected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *iser_conn; struct ib_qp_attr attr; struct ib_qp_init_attr init_attr; iser_conn = cma_id->context; (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); ISER_INFO("remote qpn:%x my qpn:%x", attr.dest_qp_num, cma_id->qp->qp_num); iser_conn->state = ISER_CONN_UP; cv_signal(&iser_conn->up_cv); } /** * Called with state mutex held * * Terminates the connection and, when iser_conn_terminate() returns * non-zero, reports the failure through the connection's ic_error() * callback. The destroy argument is not used. **/ static void iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy) { struct iser_conn *iser_conn = cma_id->context; if (iser_conn_terminate(iser_conn)) iser_conn->icl_conn.ic_error(&iser_conn->icl_conn); }
/*
 * iser_cma_handler - RDMA CM event dispatcher for an iSER connection
 *
 * Takes the connection's state_mutex for the duration of the dispatch and
 * routes the event to the address/route/established, connect-error or
 * cleanup handler.  Unknown events are only logged.  Always returns 0.
 */
int iser_cma_handler(struct 
rdma_cm_id *cma_id, struct rdma_cm_event *event) { struct iser_conn *iser_conn; int ret = 0; iser_conn = cma_id->context; ISER_INFO("event %d status %d conn %p id %p", event->event, event->status, cma_id->context, cma_id); sx_xlock(&iser_conn->state_mutex); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: iser_addr_handler(cma_id); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: iser_connected_handler(cma_id); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_TIMEWAIT_EXIT: iser_cleanup_handler(cma_id, false); break; default: ISER_ERR("Unexpected RDMA CM event (%d)", event->event); break; } sx_xunlock(&iser_conn->state_mutex); return (ret); }
/*
 * iser_post_recvl - post the single login-response receive buffer
 *
 * Builds a one-SGE receive work request over login_resp_dma
 * (ISER_RX_LOGIN_SIZE bytes) whose wr_id is the login_resp_buf address.
 * post_recv_buf_count is incremented before posting and rolled back when
 * ib_post_recv() fails.  Returns the ib_post_recv() result.
 */
int iser_post_recvl(struct iser_conn *iser_conn) { - struct ib_recv_wr rx_wr, *rx_wr_failed; + const struct ib_recv_wr *rx_wr_failed; + struct ib_recv_wr rx_wr; struct ib_conn *ib_conn = &iser_conn->ib_conn; struct ib_sge sge; int ib_ret; sge.addr = iser_conn->login_resp_dma; sge.length = ISER_RX_LOGIN_SIZE; sge.lkey = ib_conn->device->mr->lkey; rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; rx_wr.sg_list = &sge; rx_wr.num_sge = 1; rx_wr.next = NULL; ib_conn->post_recv_buf_count++; ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); if (ib_ret) { ISER_ERR("ib_post_recv failed ret=%d", ib_ret); ib_conn->post_recv_buf_count--; } return (ib_ret); }
/*
 * iser_post_recvm - post @count receive descriptors as one chained list
 *
 * Work requests are taken from ib_conn->rx_wr and pointed at consecutive
 * rx_descs entries starting at rx_desc_head, wrapping modulo
 * qp_max_recv_dtos.  rx_desc_head is advanced only when ib_post_recv()
 * succeeds; on failure the post_recv_buf_count increment is undone.
 * Returns the ib_post_recv() result.
 * NOTE(review): count == 0 would step rx_wr before the array start --
 * presumably callers always pass count >= 1; confirm.
 */
int iser_post_recvm(struct iser_conn *iser_conn, int count) { - struct ib_recv_wr *rx_wr, *rx_wr_failed; + const struct ib_recv_wr *rx_wr_failed; + struct ib_recv_wr *rx_wr; int i, ib_ret; struct ib_conn *ib_conn = &iser_conn->ib_conn; unsigned int my_rx_head = iser_conn->rx_desc_head; struct iser_rx_desc *rx_desc; for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) 
{ rx_desc = &iser_conn->rx_descs[my_rx_head]; rx_wr->wr_id = (uintptr_t)rx_desc; rx_wr->sg_list = &rx_desc->rx_sg; rx_wr->num_sge = 1; rx_wr->next = rx_wr + 1; my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos; } rx_wr--; rx_wr->next = NULL; /* mark end of work requests list */ ib_conn->post_recv_buf_count += count; ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed); if (ib_ret) { ISER_ERR("ib_post_recv failed ret=%d", ib_ret); ib_conn->post_recv_buf_count -= count; } else iser_conn->rx_desc_head = my_rx_head; return (ib_ret); } /** * iser_post_send - Initiate a Send DTO operation * * Syncs the descriptor's header area (ISER_HEADERS_LEN bytes) for the * device, then posts a single IB_WR_SEND work request whose wr_id is the * descriptor address; @signal selects IB_SEND_SIGNALED. * * returns 0 on success, otherwise the error returned by ib_post_send() */ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, bool signal) { int ib_ret; - struct ib_send_wr send_wr, *send_wr_failed; + const struct ib_send_wr *send_wr_failed; + struct ib_send_wr send_wr; ib_dma_sync_single_for_device(ib_conn->device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); send_wr.next = NULL; send_wr.wr_id = (uintptr_t)tx_desc; send_wr.sg_list = tx_desc->tx_sg; send_wr.num_sge = tx_desc->num_sge; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0; ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); if (ib_ret) ISER_ERR("ib_post_send failed, ret:%d", ib_ret); return (ib_ret); } diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib.h b/sys/dev/mlx4/mlx4_ib/mlx4_ib.h index 100a06b75af7..544ed1913419 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib.h +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib.h @@ -1,899 +1,899 @@ /* * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef MLX4_IB_H #define MLX4_IB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #define MLX4_IB_DRV_NAME "mlx4_ib" #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ #define mlx4_ib_warn(ibdev, format, arg...) 
\ dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) enum { MLX4_IB_SQ_MIN_WQE_SHIFT = 6, MLX4_IB_MAX_HEADROOM = 2048 }; #define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) #define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) /*module param to indicate if SM assigns the alias_GUID*/ extern int mlx4_ib_sm_guid_assign; extern struct proc_dir_entry *mlx4_mrs_dir_entry; #define MLX4_IB_UC_STEER_QPN_ALIGN 1 #define MLX4_IB_UC_MAX_NUM_QPS 256 enum hw_bar_type { HW_BAR_BF, HW_BAR_DB, HW_BAR_CLOCK, HW_BAR_COUNT }; struct mlx4_ib_vma_private_data { struct vm_area_struct *vma; }; struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; struct list_head db_page_list; struct mutex db_page_mutex; struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; }; struct mlx4_ib_pd { struct ib_pd ibpd; u32 pdn; }; struct mlx4_ib_xrcd { struct ib_xrcd ibxrcd; u32 xrcdn; struct ib_pd *pd; struct ib_cq *cq; }; struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; int entry_size; }; struct mlx4_ib_cq_resize { struct mlx4_ib_cq_buf buf; int cqe; }; struct mlx4_ib_cq { struct ib_cq ibcq; struct mlx4_cq mcq; struct mlx4_ib_cq_buf buf; struct mlx4_ib_cq_resize *resize_buf; struct mlx4_db db; spinlock_t lock; struct mutex resize_mutex; struct ib_umem *umem; struct ib_umem *resize_umem; int create_flags; /* List of qps that it serves.*/ struct list_head send_qp_list; struct list_head recv_qp_list; }; #define MLX4_MR_PAGES_ALIGN 0x40 struct mlx4_ib_mr { struct ib_mr ibmr; __be64 *pages; dma_addr_t page_map; u32 npages; u32 max_pages; struct mlx4_mr mmr; struct ib_umem *umem; size_t page_map_size; }; struct mlx4_ib_mw { struct ib_mw ibmw; struct mlx4_mw mmw; }; struct mlx4_ib_fmr { struct ib_fmr ibfmr; struct mlx4_fmr mfmr; }; #define MAX_REGS_PER_FLOW 2 struct mlx4_flow_reg_id { u64 id; u64 mirror; }; struct mlx4_ib_flow { struct ib_flow ibflow; /* translating DMFS verbs sniffer rule to FW API 
requires two reg IDs */ struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW]; }; struct mlx4_ib_wq { u64 *wrid; spinlock_t lock; int wqe_cnt; int max_post; int max_gs; int offset; int wqe_shift; unsigned head; unsigned tail; }; enum { MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START }; enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1U << 31, }; struct mlx4_ib_gid_entry { struct list_head list; union ib_gid gid; int added; u8 port; }; enum mlx4_ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. 
*/ MLX4_IB_QPT_SMI = IB_QPT_SMI, MLX4_IB_QPT_GSI = IB_QPT_GSI, MLX4_IB_QPT_RC = IB_QPT_RC, MLX4_IB_QPT_UC = IB_QPT_UC, MLX4_IB_QPT_UD = IB_QPT_UD, MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, MLX4_IB_QPT_PROXY_SMI = 1 << 17, MLX4_IB_QPT_PROXY_GSI = 1 << 18, MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, MLX4_IB_QPT_TUN_SMI = 1 << 20, MLX4_IB_QPT_TUN_GSI = 1 << 21, }; #define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) enum mlx4_ib_mad_ifc_flags { MLX4_MAD_IFC_IGNORE_MKEY = 1, MLX4_MAD_IFC_IGNORE_BKEY = 2, MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | MLX4_MAD_IFC_IGNORE_BKEY), MLX4_MAD_IFC_NET_VIEW = 4, }; enum { MLX4_NUM_TUNNEL_BUFS = 256, }; struct mlx4_ib_tunnel_header { struct mlx4_av av; __be32 remote_qpn; __be32 qkey; __be16 vlan; u8 mac[6]; __be16 pkey_index; u8 reserved[6]; }; struct mlx4_ib_buf { void *addr; dma_addr_t map; }; struct mlx4_rcv_tunnel_hdr { __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: * 0x0 - no vlan was in the packet * 0x01 - C-VLAN was in the packet */ u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ u8 reserved; __be16 pkey_index; __be16 sl_vid; __be16 slid_mac_47_32; __be32 mac_31_0; }; struct mlx4_ib_proxy_sqp_hdr { struct ib_grh grh; struct mlx4_rcv_tunnel_hdr tun; } __packed; struct mlx4_roce_smac_vlan_info { u64 smac; int smac_index; int smac_port; u64 candidate_smac; int candidate_smac_index; int candidate_smac_port; u16 vid; int vlan_index; int vlan_port; u16 candidate_vid; int candidate_vlan_index; int candidate_vlan_port; int update_vid; }; struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; struct mlx4_buf buf; struct mlx4_db db; struct mlx4_ib_wq rq; u32 doorbell_qpn; 
__be32 sq_signal_bits; unsigned sq_next_wqe; int sq_max_wqes_per_wr; int sq_spare_wqes; struct mlx4_ib_wq sq; enum mlx4_ib_qp_type mlx4_ib_qp_type; struct ib_umem *umem; struct mlx4_mtt mtt; int buf_size; struct mutex mutex; u16 xrcdn; u32 flags; u8 port; u8 alt_port; u8 atomic_rd_en; u8 resp_depth; u8 sq_no_prefetch; u8 state; int mlx_type; struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; struct mlx4_roce_smac_vlan_info pri; struct mlx4_roce_smac_vlan_info alt; u64 reg_id; struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; struct counter_index *counter_index; }; struct mlx4_ib_srq { struct ib_srq ibsrq; struct mlx4_srq msrq; struct mlx4_buf buf; struct mlx4_db db; u64 *wrid; spinlock_t lock; int head; int tail; u16 wqe_ctr; struct ib_umem *umem; struct mlx4_mtt mtt; struct mutex mutex; }; struct mlx4_ib_ah { struct ib_ah ibah; union mlx4_ext_av av; }; /****************************************/ /* alias guid support */ /****************************************/ #define NUM_PORT_ALIAS_GUID 2 #define NUM_ALIAS_GUID_IN_REC 8 #define NUM_ALIAS_GUID_REC_IN_PORT 16 #define GUID_REC_SIZE 8 #define NUM_ALIAS_GUID_PER_PORT 128 #define MLX4_NOT_SET_GUID (0x00LL) #define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) enum mlx4_guid_alias_rec_status { MLX4_GUID_INFO_STATUS_IDLE, MLX4_GUID_INFO_STATUS_SET, }; #define GUID_STATE_NEED_PORT_INIT 0x01 enum mlx4_guid_alias_rec_method { MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, }; struct mlx4_sriov_alias_guid_info_rec_det { u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC]; u64 time_to_run; }; struct mlx4_sriov_alias_guid_port_rec_det { struct 
mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; struct workqueue_struct *wq; struct delayed_work alias_guid_work; u8 port; u32 state_flags; struct mlx4_sriov_alias_guid *parent; struct list_head cb_list; }; struct mlx4_sriov_alias_guid { struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; spinlock_t ag_work_lock; struct ib_sa_client *sa_client; }; struct mlx4_ib_demux_work { struct work_struct work; struct mlx4_ib_dev *dev; int slave; int do_init; u8 port; }; struct mlx4_ib_tun_tx_buf { struct mlx4_ib_buf buf; struct ib_ah *ah; }; struct mlx4_ib_demux_pv_qp { struct ib_qp *qp; enum ib_qp_type proxy_qpt; struct mlx4_ib_buf *ring; struct mlx4_ib_tun_tx_buf *tx_ring; spinlock_t tx_lock; unsigned tx_ix_head; unsigned tx_ix_tail; }; enum mlx4_ib_demux_pv_state { DEMUX_PV_STATE_DOWN, DEMUX_PV_STATE_STARTING, DEMUX_PV_STATE_ACTIVE, DEMUX_PV_STATE_DOWNING, }; struct mlx4_ib_demux_pv_ctx { int port; int slave; enum mlx4_ib_demux_pv_state state; int has_smi; struct ib_device *ib_dev; struct ib_cq *cq; struct ib_pd *pd; struct work_struct work; struct workqueue_struct *wq; struct mlx4_ib_demux_pv_qp qp[2]; }; struct mlx4_ib_demux_ctx { struct ib_device *ib_dev; int port; struct workqueue_struct *wq; struct workqueue_struct *ud_wq; spinlock_t ud_lock; atomic64_t subnet_prefix; __be64 guid_cache[128]; struct mlx4_ib_dev *dev; /* the following lock protects both mcg_table and mcg_mgid0_list */ struct mutex mcg_table_lock; struct rb_root mcg_table; struct list_head mcg_mgid0_list; struct workqueue_struct *mcg_wq; struct mlx4_ib_demux_pv_ctx **tun; atomic_t tid; int flushing; /* flushing the work queue */ }; struct mlx4_ib_sriov { struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; /* when using this spinlock you should use "irq" because * it may be called from interrupt context.*/ spinlock_t going_down_lock; int is_going_down; struct mlx4_sriov_alias_guid alias_guid; /* CM 
paravirtualization fields */ struct list_head cm_list; spinlock_t id_map_lock; struct rb_root sl_id_map; struct idr pv_id_table; }; struct gid_cache_context { int real_index; int refcount; }; struct gid_entry { union ib_gid gid; enum ib_gid_type gid_type; struct gid_cache_context *ctx; }; struct mlx4_port_gid_table { struct gid_entry gids[MLX4_MAX_PORT_GIDS]; }; struct mlx4_ib_iboe { spinlock_t lock; struct ifnet *netdevs[MLX4_MAX_PORTS]; atomic64_t mac[MLX4_MAX_PORTS]; struct notifier_block nb; struct mlx4_port_gid_table gids[MLX4_MAX_PORTS]; }; struct pkey_mgt { u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; struct list_head pkey_port_list[MLX4_MFUNC_MAX]; struct kobject *device_parent[MLX4_MFUNC_MAX]; }; struct mlx4_ib_iov_sysfs_attr { void *ctx; struct kobject *kobj; unsigned long data; u32 entry_num; char name[15]; struct device_attribute dentry; struct device *dev; }; struct mlx4_ib_iov_sysfs_attr_ar { struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; }; struct mlx4_ib_iov_port { char name[100]; u8 num; struct mlx4_ib_dev *dev; struct list_head list; struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; struct ib_port_attr attr; struct kobject *cur_port; struct kobject *admin_alias_parent; struct kobject *gids_parent; struct kobject *pkeys_parent; struct kobject *mcgs_parent; struct mlx4_ib_iov_sysfs_attr mcg_dentry; }; struct counter_index { struct list_head list; u32 index; u8 allocated; }; struct mlx4_ib_counters { struct list_head counters_list; struct mutex mutex; /* mutex for accessing counters list */ u32 default_counter; }; #define MLX4_DIAG_COUNTERS_TYPES 2 struct mlx4_ib_diag_counters { const char **name; u32 *offset; u32 num_counters; }; struct mlx4_ib_dev { struct ib_device ib_dev; struct mlx4_dev *dev; int num_ports; void __iomem *uar_map; struct mlx4_uar priv_uar; u32 priv_pdn; MLX4_DECLARE_DOORBELL_LOCK(uar_lock); struct ib_mad_agent 
*send_agent[MLX4_MAX_PORTS][2]; struct ib_ah *sm_ah[MLX4_MAX_PORTS]; spinlock_t sm_lock; atomic64_t sl2vl[MLX4_MAX_PORTS]; struct mlx4_ib_sriov sriov; struct mutex cap_mask_mutex; bool ib_active; struct mlx4_ib_iboe iboe; struct mlx4_ib_counters counters_table[MLX4_MAX_PORTS]; int *eq_table; struct kobject *iov_parent; struct kobject *ports_parent; struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; struct pkey_mgt pkeys; unsigned long *ib_uc_qpns_bitmap; int steer_qpn_count; int steer_qpn_base; int steering_support; struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; /* lock when destroying qp1_proxy and getting netdev events */ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; u8 bond_next_port; /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES]; }; struct ib_event_work { struct work_struct work; struct mlx4_ib_dev *ib_dev; struct mlx4_eqe ib_eqe; int port; }; struct mlx4_ib_qp_tunnel_init_attr { struct ib_qp_init_attr init_attr; int slave; enum ib_qp_type proxy_qp_type; u8 port; }; struct mlx4_uverbs_ex_query_device { __u32 comp_mask; __u32 reserved; }; enum query_device_resp_mask { QUERY_DEVICE_RESP_MASK_TIMESTAMP = 1UL << 0, }; struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; __u64 hca_core_clock_offset; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); } static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) { return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); } static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) { return container_of(ibpd, struct mlx4_ib_pd, ibpd); } static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) { return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); } static inline struct mlx4_ib_cq 
*to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx4_ib_cq, ibcq); } static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) { return container_of(mcq, struct mlx4_ib_cq, mcq); } static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) { return container_of(ibmr, struct mlx4_ib_mr, ibmr); } static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) { return container_of(ibmw, struct mlx4_ib_mw, ibmw); } static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) { return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); } static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) { return container_of(ibflow, struct mlx4_ib_flow, ibflow); } static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx4_ib_qp, ibqp); } static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) { return container_of(mqp, struct mlx4_ib_qp, mqp); } static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) { return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); } static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) { return container_of(msrq, struct mlx4_ib_srq, msrq); } static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) { return container_of(ibah, struct mlx4_ib_ah, ibah); } static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev) { dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports; return dev->bond_next_port + 1; } int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db); void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem); struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, 
struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int mlx4_ib_destroy_cq(struct ib_cq *cq); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct ib_udata *udata); int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int mlx4_ib_destroy_ah(struct ib_ah *ah); struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata); int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int mlx4_ib_destroy_srq(struct ib_srq *srq); void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); -int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct 
ib_udata *udata); int mlx4_ib_destroy_qp(struct ib_qp *qp); int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); -int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad); int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, struct ib_mad_hdr *out, size_t *out_mad_size, u16 *out_mad_pkey_index); int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova); int mlx4_ib_unmap_fmr(struct list_head *fmr_list); int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view); int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view); int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view); static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; if (rdma_port_get_link_layer(ah->ibah.device, port) == 
IB_LINK_LAYER_ETHERNET) return true; return !!(ah->av.ib.g_slid & 0x80); } int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); int mlx4_ib_mcg_init(void); void mlx4_ib_mcg_destroy(void); int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad); int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *mad); int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid); void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, enum ib_event_type type); void mlx4_ib_tunnels_update_work(struct work_struct *work); int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad); int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, u16 vlan_id, struct ib_mad *mad); __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, struct ib_mad *mad); int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, struct ib_mad *mad); void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); /* alias guid support */ void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 
port_num, u8 *p_data); void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8 *p_data); int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, int port, int slave_init); int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); __be64 mlx4_ib_gen_node_guid(void); int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach); int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, u8 port_num, int index); void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, int port); void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); #endif /* MLX4_IB_H */ diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c index 183ccecd2171..455b85f27942 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c @@ -1,2346 +1,2347 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" enum { MLX4_IB_VENDOR_CLASS1 = 0x9, MLX4_IB_VENDOR_CLASS2 = 0xa }; #define MLX4_TUN_SEND_WRID_SHIFT 34 #define MLX4_TUN_QPN_SHIFT 32 #define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) #define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) #define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) #define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) /* Port mgmt change event handling */ #define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) #define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) #define NUM_IDX_IN_PKEY_TBL_BLK 32 #define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ #define GUID_TBL_BLK_NUM_ENTRIES 8 #define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) struct mlx4_mad_rcv_buf { struct ib_grh grh; u8 payload[256]; } __packed; struct mlx4_mad_snd_buf { u8 payload[256]; } __packed; struct mlx4_tunnel_mad { struct ib_grh grh; struct mlx4_ib_tunnel_header hdr; struct ib_mad mad; } __packed; struct mlx4_rcv_tunnel_mad { struct mlx4_rcv_tunnel_hdr hdr; struct ib_grh grh; struct ib_mad mad; } __packed; static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, int block, u32 change_bitmap); __be64 mlx4_ib_gen_node_guid(void) { #define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) return cpu_to_be64(NODE_GUID_HI | random()); } __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) { return cpu_to_be64(atomic_inc_return(&ctx->tid)) | cpu_to_be64(0xff00000000000000LL); } int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, 
const void *in_mad, void *response_mad) { struct mlx4_cmd_mailbox *inmailbox, *outmailbox; void *inbox; int err; u32 in_modifier = port; u8 op_modifier = 0; inmailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(inmailbox)) return PTR_ERR(inmailbox); inbox = inmailbox->buf; outmailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(outmailbox)) { mlx4_free_cmd_mailbox(dev->dev, inmailbox); return PTR_ERR(outmailbox); } memcpy(inbox, in_mad, 256); /* * Key check traps can't be generated unless we have in_wc to * tell us where to send the trap. */ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) op_modifier |= 0x1; if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) op_modifier |= 0x2; if (mlx4_is_mfunc(dev->dev) && (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) op_modifier |= 0x8; if (in_wc) { struct { __be32 my_qpn; u32 reserved1; __be32 rqpn; u8 sl; u8 g_path; u16 reserved2[2]; __be16 pkey; u32 reserved3[11]; u8 grh[40]; } *ext_info; memset(inbox + 256, 0, 256); ext_info = inbox + 256; ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); ext_info->rqpn = cpu_to_be32(in_wc->src_qp); ext_info->sl = in_wc->sl << 4; ext_info->g_path = in_wc->dlid_path_bits | (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); ext_info->pkey = cpu_to_be16(in_wc->pkey_index); if (in_grh) memcpy(ext_info->grh, in_grh, 40); op_modifier |= 0x4; in_modifier |= in_wc->slid << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, (op_modifier & 0x8) ? 
MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); if (!err) memcpy(response_mad, outmailbox->buf, 256); mlx4_free_cmd_mailbox(dev->dev, inmailbox); mlx4_free_cmd_mailbox(dev->dev, outmailbox); return err; } static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) { struct ib_ah *new_ah; struct ib_ah_attr ah_attr; unsigned long flags; if (!dev->send_agent[port_num - 1][0]) return; memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = lid; ah_attr.sl = sl; ah_attr.port_num = port_num; new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, &ah_attr); if (IS_ERR(new_ah)) return; spin_lock_irqsave(&dev->sm_lock, flags); if (dev->sm_ah[port_num - 1]) ib_destroy_ah(dev->sm_ah[port_num - 1]); dev->sm_ah[port_num - 1] = new_ah; spin_unlock_irqrestore(&dev->sm_lock, flags); } /* * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can * synthesize LID change, Client-Rereg, GID change, and P_Key change events. */ static void smp_snoop(struct ib_device *ibdev, u8 port_num, const struct ib_mad *mad, u16 prev_lid) { struct ib_port_info *pinfo; u16 lid; __be16 *base; u32 bn, pkey_change_bitmap; int i; struct mlx4_ib_dev *dev = to_mdev(ibdev); if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_SET) switch (mad->mad_hdr.attr_id) { case IB_SMP_ATTR_PORT_INFO: if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) return; pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; lid = be16_to_cpu(pinfo->lid); update_sm_ah(dev, port_num, be16_to_cpu(pinfo->sm_lid), pinfo->neighbormtu_mastersmsl & 0xf); if (pinfo->clientrereg_resv_subnetto & 0x80) handle_client_rereg_event(dev, port_num); if (prev_lid != lid) handle_lid_change_event(dev, port_num); break; case IB_SMP_ATTR_PKEY_TABLE: if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) return; if (!mlx4_is_mfunc(dev->dev)) { mlx4_ib_dispatch_event(dev, port_num, 
IB_EVENT_PKEY_CHANGE); break; } /* at this point, we are running in the master. * Slaves do not receive SMPs. */ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); pkey_change_bitmap = 0; for (i = 0; i < 32; i++) { pr_debug("PKEY[%d] = x%x\n", i + bn*32, be16_to_cpu(base[i])); if (be16_to_cpu(base[i]) != dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { pkey_change_bitmap |= (1 << i); dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = be16_to_cpu(base[i]); } } pr_debug("PKEY Change event: port=%d, " "block=0x%x, change_bitmap=0x%x\n", port_num, bn, pkey_change_bitmap); if (pkey_change_bitmap) { mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_PKEY_CHANGE); if (!dev->sriov.is_going_down) __propagate_pkey_ev(dev, port_num, bn, pkey_change_bitmap); } break; case IB_SMP_ATTR_GUID_INFO: if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) return; /* paravirtualized master's guid is guid 0 -- does not change */ if (!mlx4_is_master(dev->dev)) mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_GID_CHANGE); /*if master, notify relevant slaves*/ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, (u8 *)(&((struct ib_smp *)mad)->data)); mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, (u8 *)(&((struct ib_smp *)mad)->data)); } break; case IB_SMP_ATTR_SL_TO_VL_TABLE: /* cache sl to vl mapping changes for use in * filling QP1 LRH VL field when sending packets */ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV && dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT) return; if (!mlx4_is_slave(dev->dev)) { union sl2vl_tbl_to_u64 sl2vl64; int jj; for (jj = 0; jj < 8; jj++) { sl2vl64.sl8[jj] = ((struct ib_smp *)mad)->data[jj]; pr_debug("sl2vl[%d] = %02x\n", jj, sl2vl64.sl8[jj]); } atomic64_set(&dev->sl2vl[port_num - 1], sl2vl64.sl64); } break; default: break; } } 
static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, int block, u32 change_bitmap) { int i, ix, slave, err; int have_event = 0; for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { if (slave == mlx4_master_func_num(dev->dev)) continue; if (!mlx4_is_slave_active(dev->dev, slave)) continue; have_event = 0; for (i = 0; i < 32; i++) { if (!(change_bitmap & (1 << i))) continue; for (ix = 0; ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] [ix] == i + 32 * block) { err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); pr_debug("propagate_pkey_ev: slave %d," " port %d, ix %d (%d)\n", slave, port_num, ix, err); have_event = 1; break; } } if (have_event) break; } } } static void node_desc_override(struct ib_device *dev, struct ib_mad *mad) { unsigned long flags; if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); memcpy(((struct ib_smp *) mad)->data, dev->node_desc, IB_DEVICE_NODE_DESC_MAX); spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); } } static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, const struct ib_mad *mad) { int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; struct ib_mad_send_buf *send_buf; struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; int ret; unsigned long flags; if (agent) { send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(send_buf)) return; /* * We rely here on the fact that MLX QPs don't use the * address handle after the send is posted (this is * wrong following the IB spec strictly, but we know * it's OK for our devices). 
*/ spin_lock_irqsave(&dev->sm_lock, flags); memcpy(send_buf->mad, mad, sizeof *mad); if ((send_buf->ah = dev->sm_ah[port_num - 1])) ret = ib_post_send_mad(send_buf, NULL); else ret = -EINVAL; spin_unlock_irqrestore(&dev->sm_lock, flags); if (ret) ib_free_send_mad(send_buf); } } static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad) { int ret = 0; /* dispatch to different sa handlers */ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { case IB_SA_ATTR_MC_MEMBER_REC: ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); break; default: break; } return ret; } int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); int i; for (i = 0; i < dev->dev->caps.sqp_demux; i++) { if (dev->sriov.demux[port - 1].guid_cache[i] == guid) return i; } return -1; } static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, u8 port, u16 pkey, u16 *ix) { int i, ret; u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; u16 slot_pkey; if (slave == mlx4_master_func_num(dev->dev)) return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) continue; pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); if (ret) continue; if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { if (slot_pkey & 0x8000) { *ix = (u16) pkey_ix; return 0; } else { /* take first partial pkey index found */ if (partial_ix == 0xFF) partial_ix = pkey_ix; } } } if (partial_ix < 0xFF) { *ix = (u16) partial_ix; return 0; } return -EINVAL; } int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad) { struct ib_sge list; struct ib_ud_wr wr; - 
struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct mlx4_ib_demux_pv_ctx *tun_ctx; struct mlx4_ib_demux_pv_qp *tun_qp; struct mlx4_rcv_tunnel_mad *tun_mad; struct ib_ah_attr attr; struct ib_ah *ah; struct ib_qp *src_qp = NULL; unsigned tun_tx_ix = 0; int dqpn; int ret = 0; u16 tun_pkey_ix; u16 cached_pkey; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; if (dest_qpt > IB_QPT_GSI) return -EINVAL; tun_ctx = dev->sriov.demux[port-1].tun[slave]; /* check if proxy qp created */ if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; if (!dest_qpt) tun_qp = &tun_ctx->qp[0]; else tun_qp = &tun_ctx->qp[1]; /* compute P_Key index to put in tunnel header for slave */ if (dest_qpt) { u16 pkey_ix; ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); if (ret) return -EINVAL; ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); if (ret) return -EINVAL; tun_pkey_ix = pkey_ix; } else tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; /* get tunnel tx data buf for slave */ src_qp = tun_qp->qp; /* create ah. Just need an empty one with the port num for the post send. 
* The driver will set the force loopback bit in post_send */ memset(&attr, 0, sizeof attr); attr.port_num = port; if (is_eth) { memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); attr.ah_flags = IB_AH_GRH; } ah = ib_create_ah(tun_ctx->pd, &attr); if (IS_ERR(ah)) return -ENOMEM; /* allocate tunnel tx buf after pass failure returns */ spin_lock(&tun_qp->tx_lock); if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= (MLX4_NUM_TUNNEL_BUFS - 1)) ret = -EAGAIN; else tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); spin_unlock(&tun_qp->tx_lock); if (ret) goto end; tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); if (tun_qp->tx_ring[tun_tx_ix].ah) ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); tun_qp->tx_ring[tun_tx_ix].ah = ah; ib_dma_sync_single_for_cpu(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), DMA_TO_DEVICE); /* copy over to tunnel buffer */ if (grh) memcpy(&tun_mad->grh, grh, sizeof *grh); memcpy(&tun_mad->mad, mad, sizeof *mad); /* adjust tunnel data */ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; if (is_eth) { u16 vlan = 0; if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, NULL)) { /* VST mode */ if (vlan != wc->vlan_id) { /* Packet vlan is not the VST-assigned vlan. * Drop the packet. */ ret = -EPERM; goto out; } else { /* Remove the vlan tag before forwarding * the packet to the VF. 
*/ vlan = 0xffff; } } else { vlan = wc->vlan_id; } tun_mad->hdr.sl_vid = cpu_to_be16(vlan); memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), DMA_TO_DEVICE); list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; list.length = sizeof (struct mlx4_rcv_tunnel_mad); list.lkey = tun_ctx->pd->local_dma_lkey; wr.ah = ah; wr.port_num = port; wr.remote_qkey = IB_QP_SET_QKEY; wr.remote_qpn = dqpn; wr.wr.next = NULL; wr.wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); wr.wr.sg_list = &list; wr.wr.num_sge = 1; wr.wr.opcode = IB_WR_SEND; wr.wr.send_flags = IB_SEND_SIGNALED; ret = ib_post_send(src_qp, &wr.wr, &bad_wr); if (!ret) return 0; out: spin_lock(&tun_qp->tx_lock); tun_qp->tx_ix_tail++; spin_unlock(&tun_qp->tx_lock); tun_qp->tx_ring[tun_tx_ix].ah = NULL; end: ib_destroy_ah(ah); return ret; } static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad) { struct mlx4_ib_dev *dev = to_mdev(ibdev); int err, other_port; int slave = -1; u8 *slave_id; int is_eth = 0; if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) is_eth = 0; else is_eth = 1; if (is_eth) { if (!(wc->wc_flags & IB_WC_GRH)) { mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); return -EINVAL; } if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); return -EINVAL; } err = mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave); if (err && mlx4_is_mf_bonded(dev->dev)) { other_port = (port == 1) ? 
2 : 1; err = mlx4_get_slave_from_roce_gid(dev->dev, other_port, grh->dgid.raw, &slave); if (!err) { port = other_port; pr_debug("resolved slave %d from gid %pI6 wire port %d other %d\n", slave, grh->dgid.raw, port, other_port); } } if (err) { mlx4_ib_warn(ibdev, "failed matching grh\n"); return -ENOENT; } if (slave >= dev->dev->caps.sqp_demux) { mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", slave, dev->dev->caps.sqp_demux); return -ENOENT; } if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) return 0; err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); if (err) pr_debug("failed sending to slave %d via tunnel qp (%d)\n", slave, err); return 0; } /* Initially assume that this mad is for us */ slave = mlx4_master_func_num(dev->dev); /* See if the slave id is encoded in a response mad */ if (mad->mad_hdr.method & 0x80) { slave_id = (u8 *) &mad->mad_hdr.tid; slave = *slave_id; if (slave != 255) /*255 indicates the dom0*/ *slave_id = 0; /* remap tid */ } /* If a grh is present, we demux according to it */ if (wc->wc_flags & IB_WC_GRH) { slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id); if (slave < 0) { mlx4_ib_warn(ibdev, "failed matching grh\n"); return -ENOENT; } } /* Class-specific handling */ switch (mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_LID_ROUTED: case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: /* 255 indicates the dom0 */ if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) return -EPERM; /* for a VF. drop unsolicited MADs */ if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { mlx4_ib_warn(ibdev, "demux QP0. 
rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", slave, mad->mad_hdr.mgmt_class, mad->mad_hdr.method); return -EINVAL; } } break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_demux_sa_handler(ibdev, port, slave, (struct ib_sa_mad *) mad)) return 0; break; case IB_MGMT_CLASS_CM: if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad)) return 0; break; case IB_MGMT_CLASS_DEVICE_MGMT: if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) return 0; break; default: /* Drop unsupported classes for slaves in tunnel mode */ if (slave != mlx4_master_func_num(dev->dev)) { pr_debug("dropping unsupported ingress mad from class:%d " "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); return 0; } } /*make sure that no slave==255 was not handled yet.*/ if (slave >= dev->dev->caps.sqp_demux) { mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", slave, dev->dev->caps.sqp_demux); return -ENOENT; } err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); if (err) pr_debug("failed sending to slave %d via tunnel qp (%d)\n", slave, err); return 0; } static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid, prev_lid = 0; int err; struct ib_port_attr pattr; if (in_wc && in_wc->qp->qp_num) { pr_debug("received MAD: slid:%d sqpn:%d " "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", in_wc->slid, in_wc->src_qp, in_wc->dlid_path_bits, in_wc->qp->qp_num, in_wc->wc_flags, in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, be16_to_cpu(in_mad->mad_hdr.attr_id)); if (in_wc->wc_flags & IB_WC_GRH) { pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", (unsigned long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix), (unsigned long long)be64_to_cpu(in_grh->sgid.global.interface_id)); pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", (unsigned long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix), 
(unsigned long long)be64_to_cpu(in_grh->dgid.global.interface_id)); } } slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; } if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) return IB_MAD_RESULT_SUCCESS; /* * Don't process SMInfo queries -- the SMA can't handle them. */ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) return IB_MAD_RESULT_SUCCESS; } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 || in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) return IB_MAD_RESULT_SUCCESS; } else return IB_MAD_RESULT_SUCCESS; if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) prev_lid = pattr.lid; err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | (mad_flags & IB_MAD_IGNORE_BKEY ? 
MLX4_MAD_IFC_IGNORE_BKEY : 0) | MLX4_MAD_IFC_NET_VIEW, port_num, in_wc, in_grh, in_mad, out_mad); if (err) return IB_MAD_RESULT_FAILURE; if (!out_mad->mad_hdr.status) { smp_snoop(ibdev, port_num, in_mad, prev_lid); /* slaves get node desc from FW */ if (!mlx4_is_slave(to_mdev(ibdev)->dev)) node_desc_override(ibdev, out_mad); } /* set return bit in status of directed route responses */ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) /* no response for trap repress */ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } static void edit_counter(struct mlx4_counter *cnt, void *counters, __be16 attr_id) { switch (attr_id) { case IB_PMA_PORT_COUNTERS: { struct ib_pma_portcounters *pma_cnt = (struct ib_pma_portcounters *)counters; ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, (be64_to_cpu(cnt->tx_bytes) >> 2)); ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, (be64_to_cpu(cnt->rx_bytes) >> 2)); ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, be64_to_cpu(cnt->tx_frames)); ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, be64_to_cpu(cnt->rx_frames)); break; } case IB_PMA_PORT_COUNTERS_EXT: { struct ib_pma_portcounters_ext *pma_cnt_ext = (struct ib_pma_portcounters_ext *)counters; pma_cnt_ext->port_xmit_data = cpu_to_be64(be64_to_cpu(cnt->tx_bytes) >> 2); pma_cnt_ext->port_rcv_data = cpu_to_be64(be64_to_cpu(cnt->rx_bytes) >> 2); pma_cnt_ext->port_xmit_packets = cnt->tx_frames; pma_cnt_ext->port_rcv_packets = cnt->rx_frames; break; } default: break; } } static int iboe_process_mad_port_info(void *out_mad) { struct ib_class_port_info cpi = {}; cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; memcpy(out_mad, &cpi, sizeof(cpi)); return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const 
struct ib_grh *in_grh, const struct ib_mad *in_mad, struct ib_mad *out_mad) { struct mlx4_counter counter_stats; struct mlx4_ib_dev *dev = to_mdev(ibdev); struct counter_index *tmp_counter; int err = IB_MAD_RESULT_FAILURE, stats_avail = 0; if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) return -EINVAL; if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) return iboe_process_mad_port_info((void *)(out_mad->data + 40)); memset(&counter_stats, 0, sizeof(counter_stats)); mutex_lock(&dev->counters_table[port_num - 1].mutex); list_for_each_entry(tmp_counter, &dev->counters_table[port_num - 1].counters_list, list) { err = mlx4_get_counter_stats(dev->dev, tmp_counter->index, &counter_stats, 0); if (err) { err = IB_MAD_RESULT_FAILURE; stats_avail = 0; break; } stats_avail = 1; } mutex_unlock(&dev->counters_table[port_num - 1].mutex); if (stats_avail) { memset(out_mad->data, 0, sizeof out_mad->data); switch (counter_stats.counter_mode & 0xf) { case 0: edit_counter(&counter_stats, (void *)(out_mad->data + 40), in_mad->mad_hdr.attr_id); err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; break; default: err = IB_MAD_RESULT_FAILURE; } } return err; } int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, struct ib_mad_hdr *out, size_t *out_mad_size, u16 *out_mad_pkey_index) { struct mlx4_ib_dev *dev = to_mdev(ibdev); const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num); if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || *out_mad_size != sizeof(*out_mad))) return IB_MAD_RESULT_FAILURE; /* iboe_process_mad() which uses the HCA flow-counters to implement IB PMA * queries, should be called only by VFs and for that specific purpose */ if (link == IB_LINK_LAYER_INFINIBAND) { if (mlx4_is_slave(dev->dev) && 
(in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS || in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT || in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO))) return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); return ib_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); } if (link == IB_LINK_LAYER_ETHERNET) return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); return -EINVAL; } static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { if (mad_send_wc->send_buf->context[0]) ib_destroy_ah(mad_send_wc->send_buf->context[0]); ib_free_send_mad(mad_send_wc->send_buf); } int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) { struct ib_mad_agent *agent; int p, q; int ret; enum rdma_link_layer ll; for (p = 0; p < dev->num_ports; ++p) { ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); for (q = 0; q <= 1; ++q) { if (ll == IB_LINK_LAYER_INFINIBAND) { agent = ib_register_mad_agent(&dev->ib_dev, p + 1, q ? 
IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, NULL, NULL, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; } dev->send_agent[p][q] = agent; } else dev->send_agent[p][q] = NULL; } } return 0; err: for (p = 0; p < dev->num_ports; ++p) for (q = 0; q <= 1; ++q) if (dev->send_agent[p][q]) ib_unregister_mad_agent(dev->send_agent[p][q]); return ret; } void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) { struct ib_mad_agent *agent; int p, q; for (p = 0; p < dev->num_ports; ++p) { for (q = 0; q <= 1; ++q) { agent = dev->send_agent[p][q]; if (agent) { dev->send_agent[p][q] = NULL; ib_unregister_mad_agent(agent); } } if (dev->sm_ah[p]) ib_destroy_ah(dev->sm_ah[p]); } } static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) { mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); } static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) { /* re-configure the alias-guid and mcg's */ if (mlx4_is_master(dev->dev)) { mlx4_ib_invalidate_all_guid_record(dev, port_num); if (!dev->sriov.is_going_down) { mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); } } /* Update the sl to vl table from inside client rereg * only if in secure-host mode (snooping is not possible) * and the sl-to-vl change event is not generated by FW. */ if (!mlx4_is_slave(dev->dev) && dev->dev->flags & MLX4_FLAG_SECURE_HOST && !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) { if (mlx4_is_master(dev->dev)) /* already in work queue from mlx4_ib_event queueing * mlx4_handle_port_mgmt_change_event, which calls * this procedure. Therefore, call sl2vl_update directly. 
*/ mlx4_ib_sl2vl_update(dev, port_num); else mlx4_sched_ib_sl2vl_update_work(dev, port_num); } mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); } static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, struct mlx4_eqe *eqe) { __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), GET_MASK_FROM_EQE(eqe)); } static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, u32 guid_tbl_blk_num, u32 change_bitmap) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; u16 i; if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) return; in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) { mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n"); goto out; } guid_tbl_blk_num *= 4; for (i = 0; i < 4; i++) { if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) continue; memset(in_mad, 0, sizeof *in_mad); memset(out_mad, 0, sizeof *out_mad); in_mad->base_version = 1; in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; in_mad->class_version = 1; in_mad->method = IB_MGMT_METHOD_GET; in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); if (mlx4_MAD_IFC(dev, MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, port_num, NULL, NULL, in_mad, out_mad)) { mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); goto out; } mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, port_num, (u8 *)(&((struct ib_smp *)out_mad)->data)); mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, port_num, (u8 *)(&((struct ib_smp *)out_mad)->data)); } out: kfree(in_mad); kfree(out_mad); return; } void handle_port_mgmt_change_event(struct work_struct *work) { struct ib_event_work *ew = container_of(work, struct ib_event_work, work); struct mlx4_ib_dev *dev = ew->ib_dev; struct mlx4_eqe *eqe = &(ew->ib_eqe); u8 port = eqe->event.port_mgmt_change.port; u32 changed_attr; u32 
tbl_block; u32 change_bitmap; switch (eqe->subtype) { case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); /* Update the SM ah - This should be done before handling the other changed attributes so that MADs can be sent to the SM */ if (changed_attr & MSTR_SM_CHANGE_MASK) { u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; update_sm_ah(dev, port, lid, sl); } /* Check if it is a lid change event */ if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) handle_lid_change_event(dev, port); /* Generate GUID changed event */ if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { if (mlx4_is_master(dev->dev)) { union ib_gid gid; int err = 0; if (!eqe->event.port_mgmt_change.params.port_info.gid_prefix) err = __mlx4_ib_query_gid(&dev->ib_dev, port, 0, &gid, 1); else gid.global.subnet_prefix = eqe->event.port_mgmt_change.params.port_info.gid_prefix; if (err) { pr_warn("Could not change QP1 subnet prefix for port %d: query_gid error (%d)\n", port, err); } else { pr_debug("Changing QP1 subnet prefix for port %d. old=0x%llx. 
new=0x%llx\n", port, (long long)atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix), (long long)be64_to_cpu(gid.global.subnet_prefix)); atomic64_set(&dev->sriov.demux[port - 1].subnet_prefix, be64_to_cpu(gid.global.subnet_prefix)); } } mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); /*if master, notify all slaves*/ if (mlx4_is_master(dev->dev)) mlx4_gen_slaves_port_mgt_ev(dev->dev, port, MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); } if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) handle_client_rereg_event(dev, port); break; case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) propagate_pkey_ev(dev, port, eqe); break; case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: /* paravirtualized master's guid is guid 0 -- does not change */ if (!mlx4_is_master(dev->dev)) mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); /*if master, notify relevant slaves*/ else if (!dev->sriov.is_going_down) { tbl_block = GET_BLK_PTR_FROM_EQE(eqe); change_bitmap = GET_MASK_FROM_EQE(eqe); handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); } break; case MLX4_DEV_PMC_SUBTYPE_SL_TO_VL_MAP: /* cache sl to vl mapping changes for use in * filling QP1 LRH VL field when sending packets */ if (!mlx4_is_slave(dev->dev)) { union sl2vl_tbl_to_u64 sl2vl64; int jj; for (jj = 0; jj < 8; jj++) { sl2vl64.sl8[jj] = eqe->event.port_mgmt_change.params.sl2vl_tbl_change_info.sl2vl_table[jj]; pr_debug("sl2vl[%d] = %02x\n", jj, sl2vl64.sl8[jj]); } atomic64_set(&dev->sl2vl[port - 1], sl2vl64.sl64); } break; default: pr_warn("Unsupported subtype 0x%x for " "Port Management Change event\n", eqe->subtype); } kfree(ew); } void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, enum ib_event_type type) { struct ib_event event; event.device = &dev->ib_dev; event.element.port_num = port_num; event.event = type; ib_dispatch_event(&event); } static void mlx4_ib_tunnel_comp_handler(struct ib_cq 
*cq, void *arg) { unsigned long flags; struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); spin_lock_irqsave(&dev->sriov.going_down_lock, flags); if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) queue_work(ctx->wq, &ctx->work); spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); } static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, struct mlx4_ib_demux_pv_qp *tun_qp, int index) { struct ib_sge sg_list; - struct ib_recv_wr recv_wr, *bad_recv_wr; + struct ib_recv_wr recv_wr; + const struct ib_recv_wr *bad_recv_wr; int size; size = (tun_qp->qp->qp_type == IB_QPT_UD) ? sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); sg_list.addr = tun_qp->ring[index].map; sg_list.length = size; sg_list.lkey = ctx->pd->local_dma_lkey; recv_wr.next = NULL; recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, size, DMA_FROM_DEVICE); return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); } static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad) { int ret = 0; /* dispatch to different sa handlers */ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { case IB_SA_ATTR_MC_MEMBER_REC: ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); break; default: break; } return ret; } static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) { int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; return (qpn >= proxy_start && qpn <= proxy_start + 1); } int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, u16 vlan_id, struct ib_mad *mad) { struct ib_sge list; struct ib_ud_wr wr; - struct ib_send_wr *bad_wr; + const struct ib_send_wr 
*bad_wr; struct mlx4_ib_demux_pv_ctx *sqp_ctx; struct mlx4_ib_demux_pv_qp *sqp; struct mlx4_mad_snd_buf *sqp_mad; struct ib_ah *ah; struct ib_qp *send_qp = NULL; unsigned wire_tx_ix = 0; int ret = 0; u16 wire_pkey_ix; int src_qpnum; u8 sgid_index; sqp_ctx = dev->sriov.sqps[port-1]; /* check if proxy qp created */ if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; if (dest_qpt == IB_QPT_SMI) { src_qpnum = 0; sqp = &sqp_ctx->qp[0]; wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; } else { src_qpnum = 1; sqp = &sqp_ctx->qp[1]; wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; } send_qp = sqp->qp; /* create ah */ sgid_index = attr->grh.sgid_index; attr->grh.sgid_index = 0; ah = ib_create_ah(sqp_ctx->pd, attr); if (IS_ERR(ah)) return -ENOMEM; attr->grh.sgid_index = sgid_index; to_mah(ah)->av.ib.gid_index = sgid_index; /* get rid of force-loopback bit */ to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); spin_lock(&sqp->tx_lock); if (sqp->tx_ix_head - sqp->tx_ix_tail >= (MLX4_NUM_TUNNEL_BUFS - 1)) ret = -EAGAIN; else wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); spin_unlock(&sqp->tx_lock); if (ret) goto out; sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); if (sqp->tx_ring[wire_tx_ix].ah) ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); sqp->tx_ring[wire_tx_ix].ah = ah; ib_dma_sync_single_for_cpu(&dev->ib_dev, sqp->tx_ring[wire_tx_ix].buf.map, sizeof (struct mlx4_mad_snd_buf), DMA_TO_DEVICE); memcpy(&sqp_mad->payload, mad, sizeof *mad); ib_dma_sync_single_for_device(&dev->ib_dev, sqp->tx_ring[wire_tx_ix].buf.map, sizeof (struct mlx4_mad_snd_buf), DMA_TO_DEVICE); list.addr = sqp->tx_ring[wire_tx_ix].buf.map; list.length = sizeof (struct mlx4_mad_snd_buf); list.lkey = sqp_ctx->pd->local_dma_lkey; wr.ah = ah; wr.port_num = port; wr.pkey_index = wire_pkey_ix; wr.remote_qkey = qkey; wr.remote_qpn = remote_qpn; wr.wr.next = NULL; wr.wr.wr_id = ((u64) wire_tx_ix) | 
MLX4_TUN_SET_WRID_QPN(src_qpnum); wr.wr.sg_list = &list; wr.wr.num_sge = 1; wr.wr.opcode = IB_WR_SEND; wr.wr.send_flags = IB_SEND_SIGNALED; if (s_mac) memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); if (vlan_id < 0x1000) vlan_id |= (attr->sl & 7) << 13; to_mah(ah)->av.eth.vlan = cpu_to_be16(vlan_id); ret = ib_post_send(send_qp, &wr.wr, &bad_wr); if (!ret) return 0; spin_lock(&sqp->tx_lock); sqp->tx_ix_tail++; spin_unlock(&sqp->tx_lock); sqp->tx_ring[wire_tx_ix].ah = NULL; out: ib_destroy_ah(ah); return ret; } static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) { if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) return slave; return mlx4_get_base_gid_ix(dev->dev, slave, port); } static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, struct ib_ah_attr *ah_attr) { if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) ah_attr->grh.sgid_index = slave; else ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); } static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) { struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; struct mlx4_ib_ah ah; struct ib_ah_attr ah_attr; u8 *slave_id; int slave; int port; u16 vlan_id; /* Get slave that sent this packet */ if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || (wc->src_qp & 0x1) != ctx->port - 1 || wc->src_qp & 0x4) { mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); return; } slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; if (slave != ctx->slave) { mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " "belongs to another slave\n", wc->src_qp); return; } /* Map transaction ID */ 
ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map,
				   sizeof (struct mlx4_tunnel_mad), DMA_FROM_DEVICE);
	/*
	 * For the methods listed below, the first byte of the TID must
	 * arrive as zero; it is then overwritten with the slave number.
	 * A MAD whose first TID byte is already set is logged and dropped.
	 * All other methods are passed through with the TID untouched.
	 */
	switch (tunnel->mad.mad_hdr.method) {
	case IB_MGMT_METHOD_SET:
	case IB_MGMT_METHOD_GET:
	case IB_MGMT_METHOD_REPORT:
	case IB_SA_METHOD_GET_TABLE:
	case IB_SA_METHOD_DELETE:
	case IB_SA_METHOD_GET_MULTI:
	case IB_SA_METHOD_GET_TRACE_TBL:
		slave_id = (u8 *) &tunnel->mad.mad_hdr.tid;
		if (*slave_id) {
			mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d "
				     "class:%d slave:%d\n", *slave_id,
				     tunnel->mad.mad_hdr.mgmt_class, slave);
			return;
		} else
			*slave_id = slave;
		/* no break: control drops into the empty default below */
	default:
		/* nothing */;
	}

	/*
	 * Class-specific handling.  Every branch either returns (the MAD
	 * is dropped or was consumed by the class handler) or breaks out
	 * so that the MAD is sent to the wire further down.
	 */
	switch (tunnel->mad.mad_hdr.mgmt_class) {
	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
		/* SMPs from a non-master function need SMI enabled for it */
		if (slave != mlx4_master_func_num(dev->dev) &&
		    !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port))
			return;
		break;
	case IB_MGMT_CLASS_SUBN_ADM:
		/* a non-zero return from the SA handler ends processing here */
		if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,
			      (struct ib_sa_mad *) &tunnel->mad))
			return;
		break;
	case IB_MGMT_CLASS_CM:
		/* a non-zero return from the CM handler ends processing here */
		if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave,
			      (struct ib_mad *) &tunnel->mad))
			return;
		break;
	case IB_MGMT_CLASS_DEVICE_MGMT:
		/* only GET and SET are forwarded for device management */
		if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET &&
		    tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET)
			return;
		break;
	default:
		/* Drop unsupported classes for slaves in tunnel mode */
		if (slave != mlx4_master_func_num(dev->dev)) {
			mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d "
				     "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave);
			return;
		}
	}

	/* We are using standard ib_core services to send the mad, so generate a
	 * standard address handle by decoding the tunnelled mlx4_ah fields */
	memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av));
	ah.ibah.device = ctx->ib_dev;

	/*
	 * The port number is carried in the top byte of port_pd.  It is
	 * translated with mlx4_slave_convert_port() (a negative result
	 * drops the MAD) and written back, keeping the low 24 bits.
	 */
	port = be32_to_cpu(ah.av.ib.port_pd) >> 24;
	port = mlx4_slave_convert_port(dev->dev, slave, port);
	if (port < 0)
		return;
	ah.av.ib.port_pd = cpu_to_be32(port << 24 | (be32_to_cpu(ah.av.ib.port_pd) & 0xffffff));
mlx4_ib_query_ah(&ah.ibah, &ah_attr); if (ah_attr.ah_flags & IB_AH_GRH) fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); vlan_id = be16_to_cpu(tunnel->hdr.vlan); /* if slave have default vlan use it */ mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, &vlan_id, &ah_attr.sl); mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? IB_QPT_SMI : IB_QPT_GSI, be16_to_cpu(tunnel->hdr.pkey_index), be32_to_cpu(tunnel->hdr.remote_qpn), be32_to_cpu(tunnel->hdr.qkey), &ah_attr, wc->smac, vlan_id, &tunnel->mad); } static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, enum ib_qp_type qp_type, int is_tun) { int i; struct mlx4_ib_demux_pv_qp *tun_qp; int rx_buf_size, tx_buf_size; if (qp_type > IB_QPT_GSI) return -EINVAL; tun_qp = &ctx->qp[qp_type]; tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, GFP_KERNEL); if (!tun_qp->ring) return -ENOMEM; tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, sizeof (struct mlx4_ib_tun_tx_buf), GFP_KERNEL); if (!tun_qp->tx_ring) { kfree(tun_qp->ring); tun_qp->ring = NULL; return -ENOMEM; } if (is_tun) { rx_buf_size = sizeof (struct mlx4_tunnel_mad); tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); } else { rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); tx_buf_size = sizeof (struct mlx4_mad_snd_buf); } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); if (!tun_qp->ring[i].addr) goto err; tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, tun_qp->ring[i].addr, rx_buf_size, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) { kfree(tun_qp->ring[i].addr); goto err; } } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { tun_qp->tx_ring[i].buf.addr = kmalloc(tx_buf_size, GFP_KERNEL); if (!tun_qp->tx_ring[i].buf.addr) goto tx_err; tun_qp->tx_ring[i].buf.map = ib_dma_map_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.addr, tx_buf_size, DMA_TO_DEVICE); if 
(ib_dma_mapping_error(ctx->ib_dev, tun_qp->tx_ring[i].buf.map)) { kfree(tun_qp->tx_ring[i].buf.addr); goto tx_err; } tun_qp->tx_ring[i].ah = NULL; } spin_lock_init(&tun_qp->tx_lock); tun_qp->tx_ix_head = 0; tun_qp->tx_ix_tail = 0; tun_qp->proxy_qpt = qp_type; return 0; tx_err: while (i > 0) { --i; ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, tx_buf_size, DMA_TO_DEVICE); kfree(tun_qp->tx_ring[i].buf.addr); } kfree(tun_qp->tx_ring); tun_qp->tx_ring = NULL; i = MLX4_NUM_TUNNEL_BUFS; err: while (i > 0) { --i; ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, rx_buf_size, DMA_FROM_DEVICE); kfree(tun_qp->ring[i].addr); } kfree(tun_qp->ring); tun_qp->ring = NULL; return -ENOMEM; } static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, enum ib_qp_type qp_type, int is_tun) { int i; struct mlx4_ib_demux_pv_qp *tun_qp; int rx_buf_size, tx_buf_size; if (qp_type > IB_QPT_GSI) return; tun_qp = &ctx->qp[qp_type]; if (is_tun) { rx_buf_size = sizeof (struct mlx4_tunnel_mad); tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); } else { rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); tx_buf_size = sizeof (struct mlx4_mad_snd_buf); } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, rx_buf_size, DMA_FROM_DEVICE); kfree(tun_qp->ring[i].addr); } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, tx_buf_size, DMA_TO_DEVICE); kfree(tun_qp->tx_ring[i].buf.addr); if (tun_qp->tx_ring[i].ah) ib_destroy_ah(tun_qp->tx_ring[i].ah); } kfree(tun_qp->tx_ring); kfree(tun_qp->ring); } static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) { struct mlx4_ib_demux_pv_ctx *ctx; struct mlx4_ib_demux_pv_qp *tun_qp; struct ib_wc wc; int ret; ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; if (wc.status == 
IB_WC_SUCCESS) { switch (wc.opcode) { case IB_WC_RECV: mlx4_ib_multiplex_mad(ctx, &wc); ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)); if (ret) pr_err("Failed reposting tunnel " "buf:%lld\n", (unsigned long long)wc.wr_id); break; case IB_WC_SEND: pr_debug("received tunnel send completion:" "wrid=0x%llx, status=0x%x\n", (unsigned long long)wc.wr_id, wc.status); ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&tun_qp->tx_lock); tun_qp->tx_ix_tail++; spin_unlock(&tun_qp->tx_lock); break; default: break; } } else { pr_debug("mlx4_ib: completion error in tunnel: %d." " status = %d, wrid = 0x%llx\n", ctx->slave, wc.status, (unsigned long long)wc.wr_id); if (!MLX4_TUN_IS_RECV(wc.wr_id)) { ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&tun_qp->tx_lock); tun_qp->tx_ix_tail++; spin_unlock(&tun_qp->tx_lock); } } } } static void pv_qp_event_handler(struct ib_event *event, void *qp_context) { struct mlx4_ib_demux_pv_ctx *sqp = qp_context; /* It's worse than that! He's dead, Jim! 
*/ pr_err("Fatal error (%d) on a MAD QP on port %d\n", event->event, sqp->port); } static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, enum ib_qp_type qp_type, int create_tun) { int i, ret; struct mlx4_ib_demux_pv_qp *tun_qp; struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; struct ib_qp_attr attr; int qp_attr_mask_INIT; if (qp_type > IB_QPT_GSI) return -EINVAL; tun_qp = &ctx->qp[qp_type]; memset(&qp_init_attr, 0, sizeof qp_init_attr); qp_init_attr.init_attr.send_cq = ctx->cq; qp_init_attr.init_attr.recv_cq = ctx->cq; qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; qp_init_attr.init_attr.cap.max_send_sge = 1; qp_init_attr.init_attr.cap.max_recv_sge = 1; if (create_tun) { qp_init_attr.init_attr.qp_type = IB_QPT_UD; qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_TUNNEL_QP; qp_init_attr.port = ctx->port; qp_init_attr.slave = ctx->slave; qp_init_attr.proxy_qp_type = qp_type; qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; } else { qp_init_attr.init_attr.qp_type = qp_type; qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_SQP; qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; } qp_init_attr.init_attr.port_num = ctx->port; qp_init_attr.init_attr.qp_context = ctx; qp_init_attr.init_attr.event_handler = pv_qp_event_handler; tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); if (IS_ERR(tun_qp->qp)) { ret = PTR_ERR(tun_qp->qp); tun_qp->qp = NULL; pr_err("Couldn't create %s QP (%d)\n", create_tun ? 
"tunnel" : "special", ret); return ret; } memset(&attr, 0, sizeof attr); attr.qp_state = IB_QPS_INIT; ret = 0; if (create_tun) ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, ctx->port, IB_DEFAULT_PKEY_FULL, &attr.pkey_index); if (ret || !create_tun) attr.pkey_index = to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; attr.qkey = IB_QP1_QKEY; attr.port_num = ctx->port; ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); if (ret) { pr_err("Couldn't change %s qp state to INIT (%d)\n", create_tun ? "tunnel" : "special", ret); goto err_qp; } attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); if (ret) { pr_err("Couldn't change %s qp state to RTR (%d)\n", create_tun ? "tunnel" : "special", ret); goto err_qp; } attr.qp_state = IB_QPS_RTS; attr.sq_psn = 0; ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { pr_err("Couldn't change %s qp state to RTS (%d)\n", create_tun ? "tunnel" : "special", ret); goto err_qp; } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); if (ret) { pr_err(" mlx4_ib_post_pv_buf error" " (err = %d, i = %d)\n", ret, i); goto err_qp; } } return 0; err_qp: ib_destroy_qp(tun_qp->qp); tun_qp->qp = NULL; return ret; } /* * IB MAD completion callback for real SQPs */ static void mlx4_ib_sqp_comp_worker(struct work_struct *work) { struct mlx4_ib_demux_pv_ctx *ctx; struct mlx4_ib_demux_pv_qp *sqp; struct ib_wc wc; struct ib_grh *grh; struct ib_mad *mad; ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; if (wc.status == IB_WC_SUCCESS) { switch (wc.opcode) { case IB_WC_SEND: ib_destroy_ah(sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&sqp->tx_lock); sqp->tx_ix_tail++; spin_unlock(&sqp->tx_lock); 
break; case IB_WC_RECV: mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) (sqp->ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); grh = &(((struct mlx4_mad_rcv_buf *) (sqp->ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1))) pr_err("Failed reposting SQP " "buf:%lld\n", (unsigned long long)wc.wr_id); break; default: BUG_ON(1); break; } } else { pr_debug("mlx4_ib: completion error in tunnel: %d." " status = %d, wrid = 0x%llx\n", ctx->slave, wc.status, (unsigned long long)wc.wr_id); if (!MLX4_TUN_IS_RECV(wc.wr_id)) { ib_destroy_ah(sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah = NULL; spin_lock(&sqp->tx_lock); sqp->tx_ix_tail++; spin_unlock(&sqp->tx_lock); } } } } static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, struct mlx4_ib_demux_pv_ctx **ret_ctx) { struct mlx4_ib_demux_pv_ctx *ctx; *ret_ctx = NULL; ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); if (!ctx) { pr_err("failed allocating pv resource context " "for port %d, slave %d\n", port, slave); return -ENOMEM; } ctx->ib_dev = &dev->ib_dev; ctx->port = port; ctx->slave = slave; *ret_ctx = ctx; return 0; } static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) { if (dev->sriov.demux[port - 1].tun[slave]) { kfree(dev->sriov.demux[port - 1].tun[slave]); dev->sriov.demux[port - 1].tun[slave] = NULL; } } static int create_pv_resources(struct ib_device *ibdev, int slave, int port, int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) { int ret, cq_size; struct ib_cq_init_attr cq_attr = {}; if (ctx->state != DEMUX_PV_STATE_DOWN) return -EEXIST; ctx->state = DEMUX_PV_STATE_STARTING; /* have QP0 only if link layer is IB */ if (rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) ctx->has_smi = 1; if (ctx->has_smi) { ret = 
mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); if (ret) { pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); goto err_out; } } ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); if (ret) { pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); goto err_out_qp0; } cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; if (ctx->has_smi) cq_size *= 2; cq_attr.cqe = cq_size; ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, NULL, ctx, &cq_attr); if (IS_ERR(ctx->cq)) { ret = PTR_ERR(ctx->cq); pr_err("Couldn't create tunnel CQ (%d)\n", ret); goto err_buf; } ctx->pd = ib_alloc_pd(ctx->ib_dev, 0); if (IS_ERR(ctx->pd)) { ret = PTR_ERR(ctx->pd); pr_err("Couldn't create tunnel PD (%d)\n", ret); goto err_cq; } if (ctx->has_smi) { ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); if (ret) { pr_err("Couldn't create %s QP0 (%d)\n", create_tun ? "tunnel for" : "", ret); goto err_pd; } } ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); if (ret) { pr_err("Couldn't create %s QP1 (%d)\n", create_tun ? 
"tunnel for" : "", ret); goto err_qp0; } if (create_tun) INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); else INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); if (ret) { pr_err("Couldn't arm tunnel cq (%d)\n", ret); goto err_wq; } ctx->state = DEMUX_PV_STATE_ACTIVE; return 0; err_wq: ctx->wq = NULL; ib_destroy_qp(ctx->qp[1].qp); ctx->qp[1].qp = NULL; err_qp0: if (ctx->has_smi) ib_destroy_qp(ctx->qp[0].qp); ctx->qp[0].qp = NULL; err_pd: ib_dealloc_pd(ctx->pd); ctx->pd = NULL; err_cq: ib_destroy_cq(ctx->cq); ctx->cq = NULL; err_buf: mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); err_out_qp0: if (ctx->has_smi) mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); err_out: ctx->state = DEMUX_PV_STATE_DOWN; return ret; } static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, struct mlx4_ib_demux_pv_ctx *ctx, int flush) { if (!ctx) return; if (ctx->state > DEMUX_PV_STATE_DOWN) { ctx->state = DEMUX_PV_STATE_DOWNING; if (flush) flush_workqueue(ctx->wq); if (ctx->has_smi) { ib_destroy_qp(ctx->qp[0].qp); ctx->qp[0].qp = NULL; mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); } ib_destroy_qp(ctx->qp[1].qp); ctx->qp[1].qp = NULL; mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); ib_dealloc_pd(ctx->pd); ctx->pd = NULL; ib_destroy_cq(ctx->cq); ctx->cq = NULL; ctx->state = DEMUX_PV_STATE_DOWN; } } static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, int port, int do_init) { int ret = 0; if (!do_init) { clean_vf_mcast(&dev->sriov.demux[port - 1], slave); /* for master, destroy real sqp resources */ if (slave == mlx4_master_func_num(dev->dev)) destroy_pv_resources(dev, slave, port, dev->sriov.sqps[port - 1], 1); /* destroy the tunnel qp resources */ destroy_pv_resources(dev, slave, port, dev->sriov.demux[port - 1].tun[slave], 1); return 0; } /* create the tunnel qp resources */ ret = create_pv_resources(&dev->ib_dev, slave, port, 1, 
dev->sriov.demux[port - 1].tun[slave]); /* for master, create the real sqp resources */ if (!ret && slave == mlx4_master_func_num(dev->dev)) ret = create_pv_resources(&dev->ib_dev, slave, port, 0, dev->sriov.sqps[port - 1]); return ret; } void mlx4_ib_tunnels_update_work(struct work_struct *work) { struct mlx4_ib_demux_work *dmxw; dmxw = container_of(work, struct mlx4_ib_demux_work, work); mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, dmxw->do_init); kfree(dmxw); return; } static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, struct mlx4_ib_demux_ctx *ctx, int port) { char name[12]; int ret = 0; int i; ctx->tun = kcalloc(dev->dev->caps.sqp_demux, sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); if (!ctx->tun) return -ENOMEM; ctx->dev = dev; ctx->port = port; ctx->ib_dev = &dev->ib_dev; for (i = 0; i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->persist->num_vfs + 1)); i++) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev->dev, i); if (!test_bit(port - 1, actv_ports.ports)) continue; ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); if (ret) { ret = -ENOMEM; goto err_mcg; } } ret = mlx4_ib_mcg_port_init(ctx); if (ret) { pr_err("Failed initializing mcg para-virt (%d)\n", ret); goto err_mcg; } snprintf(name, sizeof name, "mlx4_ibt%d", port); ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); if (!ctx->wq) { pr_err("Failed to create tunnelling WQ for port %d\n", port); ret = -ENOMEM; goto err_wq; } snprintf(name, sizeof name, "mlx4_ibud%d", port); ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); if (!ctx->ud_wq) { pr_err("Failed to create up/down WQ for port %d\n", port); ret = -ENOMEM; goto err_udwq; } return 0; err_udwq: destroy_workqueue(ctx->wq); ctx->wq = NULL; err_wq: mlx4_ib_mcg_port_cleanup(ctx, 1); err_mcg: for (i = 0; i < dev->dev->caps.sqp_demux; i++) free_pv_object(dev, i, port); kfree(ctx->tun); ctx->tun = NULL; return ret; } static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx 
*sqp_ctx) { if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { sqp_ctx->state = DEMUX_PV_STATE_DOWNING; flush_workqueue(sqp_ctx->wq); if (sqp_ctx->has_smi) { ib_destroy_qp(sqp_ctx->qp[0].qp); sqp_ctx->qp[0].qp = NULL; mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); } ib_destroy_qp(sqp_ctx->qp[1].qp); sqp_ctx->qp[1].qp = NULL; mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); ib_dealloc_pd(sqp_ctx->pd); sqp_ctx->pd = NULL; ib_destroy_cq(sqp_ctx->cq); sqp_ctx->cq = NULL; sqp_ctx->state = DEMUX_PV_STATE_DOWN; } } static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) { int i; if (ctx) { struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); mlx4_ib_mcg_port_cleanup(ctx, 1); for (i = 0; i < dev->dev->caps.sqp_demux; i++) { if (!ctx->tun[i]) continue; if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; } flush_workqueue(ctx->wq); for (i = 0; i < dev->dev->caps.sqp_demux; i++) { destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); free_pv_object(dev, i, ctx->port); } kfree(ctx->tun); destroy_workqueue(ctx->ud_wq); destroy_workqueue(ctx->wq); } } static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) { int i; if (!mlx4_is_master(dev->dev)) return; /* initialize or tear down tunnel QPs for the master */ for (i = 0; i < dev->dev->caps.num_ports; i++) mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); return; }

/*
 * Bring up the SR-IOV para-virtualization services of the IB device.
 *
 * Returns 0 immediately on a device that is not multi-function, and 0
 * after the CM para-virt init on a slave.  On the master it hands a node
 * GUID to every demux slot, starts the alias GUID service, registers
 * sysfs, and then, per port, caches the port GID, allocates the real
 * special-QP context (dev->sriov.sqps[i]) and the demux context, before
 * finally creating the master's tunnel QPs.
 *
 * Returns 0 on success, otherwise the error of the step that failed, with
 * everything set up so far torn down again.
 */
int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
{
	int i = 0;
	int err;

	if (!mlx4_is_mfunc(dev->dev))
		return 0;

	dev->sriov.is_going_down = 0;
	spin_lock_init(&dev->sriov.going_down_lock);
	mlx4_ib_cm_paravirt_init(dev);

	mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n");

	if (mlx4_is_slave(dev->dev)) {
		mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n");
		return 0;
	}

	/* The master keeps the device's own node GUID; others get a generated one. */
	for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
		if (i == mlx4_master_func_num(dev->dev))
			mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid);
		else
			mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid());
	}

	err = mlx4_ib_init_alias_guid_service(dev);
	if (err) {
		mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
		goto paravirt_err;
	}
	err = mlx4_ib_device_register_sysfs(dev);
	if (err) {
		mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
		goto sysfs_err;
	}

	mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n",
		     dev->dev->caps.sqp_demux);
	for (i = 0; i < dev->num_ports; i++) {
		union ib_gid gid;

		err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1);
		if (err)
			goto demux_err;
		dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
		atomic64_set(&dev->sriov.demux[i].subnet_prefix,
			     be64_to_cpu(gid.global.subnet_prefix));
		err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
				      &dev->sriov.sqps[i]);
		if (err)
			goto demux_err;
		err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1);
		if (err)
			goto free_pv;
	}
	mlx4_ib_master_tunnels(dev, 1);
	return 0;

free_pv:
	/*
	 * Port i got its sqps[] context but no demux context.  The object
	 * must be freed through sqps[i]: free_pv_object() looks at
	 * demux[i].tun[], which is NULL after the failed allocation and
	 * never held this object anyway.
	 */
	kfree(dev->sriov.sqps[i]);
	dev->sriov.sqps[i] = NULL;
demux_err:
	/* Unwind the fully initialized ports, newest first. */
	while (--i >= 0) {
		kfree(dev->sriov.sqps[i]);
		dev->sriov.sqps[i] = NULL;
		/* Also releases every tun[] context of this port. */
		mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
	}
	mlx4_ib_device_unregister_sysfs(dev);

sysfs_err:
	mlx4_ib_destroy_alias_guid_service(dev);

paravirt_err:
	mlx4_ib_cm_paravirt_clean(dev, -1);

	return err;
}

void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) { int i; unsigned long flags; if (!mlx4_is_mfunc(dev->dev)) return; spin_lock_irqsave(&dev->sriov.going_down_lock, flags); dev->sriov.is_going_down = 1; spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); if (mlx4_is_master(dev->dev)) { for (i = 0; i < dev->num_ports; i++) { flush_workqueue(dev->sriov.demux[i].ud_wq); mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); kfree(dev->sriov.sqps[i]); dev->sriov.sqps[i] = NULL; mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); } mlx4_ib_cm_paravirt_clean(dev, -1); mlx4_ib_destroy_alias_guid_service(dev); mlx4_ib_device_unregister_sysfs(dev); } } diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c 
b/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c index 280ab6359dd1..9eebbd6bb978 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c @@ -1,3522 +1,3522 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); enum { MLX4_IB_ACK_REQ_FREQ = 8, }; enum { MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, MLX4_IB_LINK_TYPE_IB = 0, MLX4_IB_LINK_TYPE_ETH = 1 }; enum { /* * Largest possible UD header: send with GRH and immediate * data plus 18 bytes for an Ethernet header with VLAN/802.1Q * tag. (LRH would only use 8 bytes, so Ethernet is the * biggest case) */ MLX4_IB_UD_HEADER_SIZE = 82, MLX4_IB_LSO_HEADER_SPARE = 128, }; enum { MLX4_IB_IBOE_ETHERTYPE = 0x8915 }; struct mlx4_ib_sqp { struct mlx4_ib_qp qp; int pkey_index; u32 qkey; u32 send_psn; struct ib_ud_header ud_header; u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; struct ib_qp *roce_v2_gsi; }; enum { MLX4_IB_MIN_SQ_STRIDE = 6, MLX4_IB_CACHE_LINE_SIZE = 64, }; enum { MLX4_RAW_QP_MTU = 7, MLX4_RAW_QP_MSGMAX = 31, }; #ifndef ETH_ALEN #define ETH_ALEN 6 #endif static const __be32 mlx4_ib_opcode[] = { [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), [IB_WR_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), 
}; /* Map an mlx4_ib_qp back to the mlx4_ib_sqp that embeds it as its 'qp' member. */ static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) { return container_of(mqp, struct mlx4_ib_sqp, qp); } /* Non-zero only on the master, when the QPN lies in [base_tunnel_sqpn, base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX). */ static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { if (!mlx4_is_master(dev->dev)) return 0; return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX; } /* Non-zero when the QP is a real special QP, a proxy QP0/QP1, or carries the MLX4_IB_ROCE_V2_GSI_QP flag. */ static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { int proxy_sqp = 0; int real_sqp = 0; int i; /* PPF or Native -- real SQP */ real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); if (real_sqp) return 1; /* VF or PF -- proxy SQP */ if (mlx4_is_mfunc(dev->dev)) { for (i = 0; i < dev->dev->caps.num_ports; i++) { if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] || qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) { proxy_sqp = 1; break; } } } if (proxy_sqp) return 1; return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP); } /* used for INIT/CLOSE port logic */ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { int proxy_qp0 = 0; int real_qp0 = 0; int i; /* PPF or Native -- real QP0 */ real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); if (real_qp0) return 1; /* VF or PF -- proxy QP0 */ if (mlx4_is_mfunc(dev->dev)) { for (i = 0; i < dev->dev->caps.num_ports; i++) { if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) { proxy_qp0 = 1; break; } } } return proxy_qp0; } /* Resolve a byte offset inside the QP buffer through mlx4_buf_offset(). */ static void *get_wqe(struct mlx4_ib_qp *qp, int offset) { return mlx4_buf_offset(&qp->buf, offset); } /* Receive WQE n sits at rq.offset + (n << rq.wqe_shift). */ static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) { return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); } /* Send WQE n sits at sq.offset + (n << sq.wqe_shift). */ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) { return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); } /* * Stamp a SQ WQE so that it is invalid if prefetched by 
marking the * first four bytes of every 64 byte chunk with * 0x7FFFFFFF | (invalid_ownership_value << 31). * * When the max work request size is less than or equal to the WQE * basic block size, as an optimization, we can stamp all WQEs with * 0xffffffff, and skip the very first chunk of each WQE. */ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) { __be32 *wqe; int i; int s; int ind; void *buf; __be32 stamp; struct mlx4_wqe_ctrl_seg *ctrl; if (qp->sq_max_wqes_per_wr > 1) { /* Round the size up to whole WQE basic blocks, then stamp every 64-byte chunk. */ s = roundup(size, 1U << qp->sq.wqe_shift); for (i = 0; i < s; i += 64) { ind = (i >> qp->sq.wqe_shift) + n; /* The wqe_cnt bit of the running index decides whether bit 31 of the stamp is set. */ stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : cpu_to_be32(0xffffffff); buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); *wqe = stamp; } } else { ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); /* fence_size keeps the WQE size in 16-byte units in its low 6 bits. */ s = (ctrl->fence_size & 0x3f) << 4; /* Start at 64 so the first chunk of the WQE is left untouched. */ for (i = 64; i < s; i += 64) { wqe = buf + i; *wqe = cpu_to_be32(0xffffffff); } } } static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) { struct mlx4_wqe_ctrl_seg *ctrl; struct mlx4_wqe_inline_seg *inl; void *wqe; int s; ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); s = sizeof(struct mlx4_wqe_ctrl_seg); if (qp->ibqp.qp_type == IB_QPT_UD) { struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; struct mlx4_av *av = (struct mlx4_av *)dgram->av; memset(dgram, 0, sizeof *dgram); av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); s += sizeof(struct mlx4_wqe_datagram_seg); } /* Pad the remainder of the WQE with an inline data segment. */ if (size > s) { inl = wqe + s; inl->byte_count = cpu_to_be32(1U << 31 | (size - s - sizeof *inl)); } ctrl->srcrb_flags = 0; ctrl->fence_size = size / 16; /* * Make sure descriptor is fully written before setting ownership bit * (because HW can start executing as soon as we do). */ wmb(); ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | (n & qp->sq.wqe_cnt ? 
cpu_to_be32(1U << 31) : 0); stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); } /* Post NOP WQE to prevent wrap-around in the middle of WR */ static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) { unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); if (unlikely(s < qp->sq_max_wqes_per_wr)) { post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); ind += s; } return ind; } static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) { struct ib_event event; struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; if (type == MLX4_EVENT_TYPE_PATH_MIG) to_mibqp(qp)->port = to_mibqp(qp)->alt_port; if (ibqp->event_handler) { event.device = ibqp->device; event.element.qp = ibqp; switch (type) { case MLX4_EVENT_TYPE_PATH_MIG: event.event = IB_EVENT_PATH_MIG; break; case MLX4_EVENT_TYPE_COMM_EST: event.event = IB_EVENT_COMM_EST; break; case MLX4_EVENT_TYPE_SQ_DRAINED: event.event = IB_EVENT_SQ_DRAINED; break; case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: event.event = IB_EVENT_QP_LAST_WQE_REACHED; break; case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: event.event = IB_EVENT_QP_FATAL; break; case MLX4_EVENT_TYPE_PATH_MIG_FAILED: event.event = IB_EVENT_PATH_MIG_ERR; break; case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: event.event = IB_EVENT_QP_REQ_ERR; break; case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: event.event = IB_EVENT_QP_ACCESS_ERR; break; default: pr_warn("Unexpected event type %d " "on QP %06x\n", type, qp->qpn); return; } ibqp->event_handler(&event, ibqp->qp_context); } } static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) { /* * UD WQEs must have a datagram segment. * RC and UC WQEs might have a remote address segment. * MLX WQEs need two extra inline data segments (for the UD * header and space for the ICRC). */ switch (type) { case MLX4_IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + ((flags & MLX4_IB_QP_LSO) ? 
MLX4_IB_LSO_HEADER_SPARE : 0); case MLX4_IB_QPT_PROXY_SMI_OWNER: case MLX4_IB_QPT_PROXY_SMI: case MLX4_IB_QPT_PROXY_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + 64; case MLX4_IB_QPT_TUN_SMI_OWNER: case MLX4_IB_QPT_TUN_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg); case MLX4_IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_RC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_masked_atomic_seg) + sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + ALIGN(MLX4_IB_UD_HEADER_SIZE + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, MLX4_INLINE_ALIGN) * sizeof (struct mlx4_wqe_inline_seg), sizeof (struct mlx4_wqe_data_seg)) + ALIGN(4 + sizeof (struct mlx4_wqe_inline_seg), sizeof (struct mlx4_wqe_data_seg)); default: return sizeof (struct mlx4_wqe_ctrl_seg); } } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, int is_user, int has_rq, struct mlx4_ib_qp *qp) { /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) return -EINVAL; if (!has_rq) { if (cap->max_recv_wr) return -EINVAL; qp->rq.wqe_cnt = qp->rq.max_gs = 0; } else { /* HW requires >= 1 RQ entry with >= 1 gather entry */ if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) return -EINVAL; qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); } /* leave userspace return values as they were, so as not to break ABI */ if (is_user) { cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; cap->max_recv_sge = qp->rq.max_gs; } else { cap->max_recv_wr = qp->rq.max_post = min(dev->dev->caps.max_wqes - 
MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); cap->max_recv_sge = min(qp->rq.max_gs, min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)); } return 0; } static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp, bool shrink_wqe) { int s; /* Sanity check SQ size before proceeding */ if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || cap->max_inline_data + send_wqe_overhead(type, qp->flags) + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) return -EINVAL; /* * For MLX transport we need 2 extra S/G entries: * one for the header and one for the checksum at the end */ if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) return -EINVAL; s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + send_wqe_overhead(type, qp->flags); if (s > dev->dev->caps.max_sq_desc_sz) return -EINVAL; /* * Hermon supports shrinking WQEs, such that a single work * request can include multiple units of 1 << wqe_shift. This * way, work requests can differ in size, and do not have to * be a power of 2 in size, saving memory and speeding up send * WR posting. Unfortunately, if we do this then the * wqe_index field in CQEs can't be used to look up the WR ID * anymore, so we do this only if selective signaling is off. * * Further, on 32-bit platforms, we can't use vmap() to make * the QP buffer virtually contiguous. Thus we have to use * constant-sized WRs to make sure a WR is always fully within * a single page-sized chunk. * * Finally, we use NOP work requests to pad the end of the * work queue, to avoid wrap-around in the middle of WR. 
We * set NEC bit to avoid getting completions with error for * these NOP WRs, but since NEC is only supported starting * with firmware 2.2.232, we use constant-sized WRs for older * firmware. * * And, since MLX QPs only support SEND, we use constant-sized * WRs in this case. * * We look for the smallest value of wqe_shift such that the * resulting number of wqes does not exceed device * capabilities. * * We set WQE size to at least 64 bytes, this way stamping * invalidates each WQE. */ if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && qp->sq_signal_bits && BITS_PER_LONG == 64 && type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) qp->sq.wqe_shift = ilog2(64); else qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); for (;;) { qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); /* * We need to leave 2 KB + 1 WR of headroom in the SQ to * allow HW to prefetch. 
*/ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * qp->sq_max_wqes_per_wr + qp->sq_spare_wqes); if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) break; if (qp->sq_max_wqes_per_wr <= 1) return -EINVAL; ++qp->sq.wqe_shift; } qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - send_wqe_overhead(type, qp->flags)) / sizeof (struct mlx4_wqe_data_seg); qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << qp->sq.wqe_shift); if (qp->rq.wqe_shift > qp->sq.wqe_shift) { qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; } else { qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; qp->sq.offset = 0; } cap->max_send_wr = qp->sq.max_post = (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; cap->max_send_sge = min(qp->sq.max_gs, min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)); /* We don't support inline sends for kernel QPs (yet) */ cap->max_inline_data = 0; return 0; } static int set_user_sq_size(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, struct mlx4_ib_create_qp *ucmd) { /* Sanity check SQ size before proceeding */ if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || ucmd->log_sq_stride > ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) return -EINVAL; qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; qp->sq.wqe_shift = ucmd->log_sq_stride; qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << qp->sq.wqe_shift); return 0; } static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) { int i; qp->sqp_proxy_rcv = kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, GFP_KERNEL); if (!qp->sqp_proxy_rcv) return -ENOMEM; for (i = 0; i < qp->rq.wqe_cnt; i++) { qp->sqp_proxy_rcv[i].addr = kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), GFP_KERNEL); if (!qp->sqp_proxy_rcv[i].addr) goto err; 
qp->sqp_proxy_rcv[i].map = ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) { kfree(qp->sqp_proxy_rcv[i].addr); goto err; } } return 0; err: while (i > 0) { --i; ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); kfree(qp->sqp_proxy_rcv[i].addr); } kfree(qp->sqp_proxy_rcv); qp->sqp_proxy_rcv = NULL; return -ENOMEM; } static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) { int i; for (i = 0; i < qp->rq.wqe_cnt; i++) { ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); kfree(qp->sqp_proxy_rcv[i].addr); } kfree(qp->sqp_proxy_rcv); } static int qp_has_rq(struct ib_qp_init_attr *attr) { if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) return 0; return !attr->srq; } static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) { int i; for (i = 0; i < dev->caps.num_ports; i++) { if (qpn == dev->caps.qp0_proxy[i]) return !!dev->caps.qp0_qkey[i]; } return 0; } static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { mutex_lock(&dev->counters_table[qp->port - 1].mutex); mlx4_counter_free(dev->dev, qp->counter_index->index); list_del(&qp->counter_index->list); mutex_unlock(&dev->counters_table[qp->port - 1].mutex); kfree(qp->counter_index); qp->counter_index = NULL; } static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, gfp_t gfp) { int qpn; int err; struct ib_qp_cap backup_cap; struct mlx4_ib_sqp *sqp; struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; struct mlx4_ib_cq *mcq; unsigned long flags; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { if (mlx4_is_mfunc(dev->dev) && (!mlx4_is_master(dev->dev) || 
!(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { if (init_attr->qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_PROXY_GSI; else { if (mlx4_is_master(dev->dev) || qp0_enabled_vf(dev->dev, sqpn)) qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; else qp_type = MLX4_IB_QPT_PROXY_SMI; } } qpn = sqpn; /* add extra sg entry for tunneling */ init_attr->cap.max_recv_sge++; } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { struct mlx4_ib_qp_tunnel_init_attr *tnl_init = container_of(init_attr, struct mlx4_ib_qp_tunnel_init_attr, init_attr); if ((tnl_init->proxy_qp_type != IB_QPT_SMI && tnl_init->proxy_qp_type != IB_QPT_GSI) || !mlx4_is_master(dev->dev)) return -EINVAL; if (tnl_init->proxy_qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_TUN_GSI; else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, tnl_init->port)) qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; else qp_type = MLX4_IB_QPT_TUN_SMI; /* we are definitely in the PPF here, since we are creating * tunnel QPs. base_tunnel_sqpn is therefore valid. 
*/ qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; sqpn = qpn; } if (!*caller_qp) { if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); if (!sqp) return -ENOMEM; qp = &sqp->qp; qp->pri.vid = 0xFFFF; qp->alt.vid = 0xFFFF; } else { qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); if (!qp) return -ENOMEM; qp->pri.vid = 0xFFFF; qp->alt.vid = 0xFFFF; } } else qp = *caller_qp; qp->mlx4_ib_qp_type = qp_type; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); INIT_LIST_HEAD(&qp->gid_list); INIT_LIST_HEAD(&qp->steering_rules); qp->state = IB_QPS_RESET; if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); if (err) goto err; if (pd->uobject) { struct mlx4_ib_create_qp ucmd; if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { err = -EFAULT; goto err; } qp->sq_no_prefetch = ucmd.sq_no_prefetch; err = set_user_sq_size(dev, qp, &ucmd); if (err) goto err; qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; } err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem), ilog2(qp->umem->page_size), &qp->mtt); if (err) goto err_buf; err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); if (err) goto err_mtt; if (qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &qp->db); if (err) goto err_mtt; } } else { qp->sq_no_prefetch = 0; if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) qp->flags 
|= MLX4_IB_QP_NETIF; else goto err; } memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap)); err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp, true); if (err) goto err; if (qp_has_rq(init_attr)) { err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); if (err) goto err; *qp->db.db = 0; } if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size, &qp->buf, gfp)) { memcpy(&init_attr->cap, &backup_cap, sizeof(backup_cap)); err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp, false); if (err) goto err_db; if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { err = -ENOMEM; goto err_db; } } err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, &qp->mtt); if (err) goto err_buf; err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); if (err) goto err_mtt; qp->sq.wrid = kmalloc_array(qp->sq.wqe_cnt, sizeof(u64), gfp | __GFP_NOWARN); if (!qp->sq.wrid) qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp, 0 /*PAGE_KERNEL*/); qp->rq.wrid = kmalloc_array(qp->rq.wqe_cnt, sizeof(u64), gfp | __GFP_NOWARN); if (!qp->rq.wrid) qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp, 0 /*PAGE_KERNEL*/); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; } } if (sqpn) { if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { if (alloc_proxy_bufs(pd->device, qp)) { err = -ENOMEM; goto err_wrid; } } } else { /* Raw packet QPNs may not have bits 6,7 set in their qp_num; * otherwise, the WQE BlueFlame setup flow wrongly causes * VLAN insertion. */ if (init_attr->qp_type == IB_QPT_RAW_PACKET) err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, (init_attr->cap.max_send_wr ? MLX4_RESERVE_ETH_BF_QP : 0) | (init_attr->cap.max_recv_wr ? 
MLX4_RESERVE_A0_QP : 0)); else if (qp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); else err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0); if (err) goto err_proxy; } if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); if (err) goto err_qpn; if (init_attr->qp_type == IB_QPT_XRC_TGT) qp->mqp.qpn |= (1 << 23); /* * Hardware wants QPN written in big-endian order (after * shifting) for send doorbell. Precompute this value to save * a little bit when posting sends. */ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); qp->mqp.event = mlx4_ib_qp_event; if (!*caller_qp) *caller_qp = qp; spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq), to_mcq(init_attr->recv_cq)); /* Maintain device to QPs access, needed for further handling * via reset flow */ list_add_tail(&qp->qps_list, &dev->qp_list); /* Maintain CQ to QPs access, needed for further handling * via reset flow */ mcq = to_mcq(init_attr->send_cq); list_add_tail(&qp->cq_send_list, &mcq->send_qp_list); mcq = to_mcq(init_attr->recv_cq); list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list); mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq), to_mcq(init_attr->recv_cq)); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); return 0; err_qpn: if (!sqpn) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qpn, 1); else mlx4_qp_release_range(dev->dev, qpn, 1); } err_proxy: if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) free_proxy_bufs(pd->device, qp); err_wrid: if (pd->uobject) { if (qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { kvfree(qp->sq.wrid); kvfree(qp->rq.wrid); } err_mtt: mlx4_mtt_cleanup(dev->dev, &qp->mtt); err_buf: if (pd->uobject) ib_umem_release(qp->umem); else mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); err_db: if (!pd->uobject && 
qp_has_rq(init_attr)) mlx4_db_free(dev->dev, &qp->db); err: if (!*caller_qp) kfree(qp); return err; } static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) { switch (state) { case IB_QPS_RESET: return MLX4_QP_STATE_RST; case IB_QPS_INIT: return MLX4_QP_STATE_INIT; case IB_QPS_RTR: return MLX4_QP_STATE_RTR; case IB_QPS_RTS: return MLX4_QP_STATE_RTS; case IB_QPS_SQD: return MLX4_QP_STATE_SQD; case IB_QPS_SQE: return MLX4_QP_STATE_SQER; case IB_QPS_ERR: return MLX4_QP_STATE_ERR; default: return -1; } } static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { if (send_cq == recv_cq) { spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) __releases(&send_cq->lock) __releases(&recv_cq->lock) { if (send_cq == recv_cq) { __release(&recv_cq->lock); spin_unlock(&send_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); spin_unlock(&recv_cq->lock); } } static void del_gid_entries(struct mlx4_ib_qp *qp) { struct mlx4_ib_gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { list_del(&ge->list); kfree(ge); } } static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) { if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); else return to_mpd(qp->ibqp.pd); } static void get_cqs(struct mlx4_ib_qp *qp, struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) { switch (qp->ibqp.qp_type) { case IB_QPT_XRC_TGT: *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); *recv_cq = *send_cq; break; case IB_QPT_XRC_INI: *send_cq = 
to_mcq(qp->ibqp.send_cq); *recv_cq = *send_cq; break; default: *send_cq = to_mcq(qp->ibqp.send_cq); *recv_cq = to_mcq(qp->ibqp.recv_cq); break; } } static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; unsigned long flags; if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = 0; qp->pri.smac_port = 0; } if (qp->alt.smac) { mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); qp->alt.smac = 0; } if (qp->pri.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); qp->pri.vid = 0xFFFF; qp->pri.candidate_vid = 0xFFFF; qp->pri.update_vid = 0; } if (qp->alt.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); qp->alt.vid = 0xFFFF; qp->alt.candidate_vid = 0xFFFF; qp->alt.update_vid = 0; } } get_cqs(qp, &send_cq, &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(send_cq, recv_cq); /* del from lists under both locks above to protect reset flow paths */ list_del(&qp->qps_list); list_del(&qp->cq_send_list); list_del(&qp->cq_recv_list); if (!is_user) { __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq): NULL); if (send_cq != recv_cq) __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); } mlx4_qp_remove(dev->dev, &qp->mqp); mlx4_ib_unlock_cqs(send_cq, recv_cq); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); mlx4_qp_free(dev->dev, &qp->mqp); if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); else mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); } mlx4_mtt_cleanup(dev->dev, &qp->mtt); if (is_user) { if (qp->rq.wqe_cnt) mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), &qp->db); ib_umem_release(qp->umem); } else { kvfree(qp->sq.wrid); kvfree(qp->rq.wrid); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(&dev->ib_dev, qp); mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); if (qp->rq.wqe_cnt) mlx4_db_free(dev->dev, &qp->db); } del_gid_entries(qp); } static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) { /* Native or PPF */ if (!mlx4_is_mfunc(dev->dev) || (mlx4_is_master(dev->dev) && attr->create_flags & MLX4_IB_SRIOV_SQP)) { return dev->dev->phys_caps.base_sqpn + (attr->qp_type == IB_QPT_SMI ? 0 : 2) + attr->port_num - 1; } /* PF or VF -- creating proxies */ if (attr->qp_type == IB_QPT_SMI) return dev->dev->caps.qp0_proxy[attr->port_num - 1]; else return dev->dev->caps.qp1_proxy[attr->port_num - 1]; } static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { struct mlx4_ib_qp *qp = NULL; int err; int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; u16 xrcdn = 0; gfp_t gfp; gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? GFP_NOIO : GFP_KERNEL; /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. 
*/ if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | MLX4_IB_QP_NETIF | MLX4_IB_QP_CREATE_ROCE_V2_GSI | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { if (init_attr->qp_type != IB_QPT_UD) return ERR_PTR(-EINVAL); } if (init_attr->create_flags) { if (udata && init_attr->create_flags & ~(sup_u_create_flags)) return ERR_PTR(-EINVAL); if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO | MLX4_IB_QP_CREATE_ROCE_V2_GSI | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) && init_attr->qp_type != IB_QPT_UD) || (init_attr->create_flags & MLX4_IB_SRIOV_SQP && init_attr->qp_type > IB_QPT_GSI) || (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI && init_attr->qp_type != IB_QPT_GSI)) return ERR_PTR(-EINVAL); } switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: pd = to_mxrcd(init_attr->xrcd)->pd; xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; /* fall through */ case IB_QPT_XRC_INI: if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); init_attr->recv_cq = init_attr->send_cq; /* fall through */ case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_RAW_PACKET: qp = kzalloc(sizeof *qp, gfp); if (!qp) return ERR_PTR(-ENOMEM); qp->pri.vid = 0xFFFF; qp->alt.vid = 0xFFFF; /* fall through */ case IB_QPT_UD: { err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, 0, &qp, gfp); if (err) { kfree(qp); return ERR_PTR(err); } qp->ibqp.qp_num = qp->mqp.qpn; qp->xrcdn = xrcdn; break; } case IB_QPT_SMI: case IB_QPT_GSI: { int sqpn; /* Userspace is not allowed to create special QPs: */ if (udata) return ERR_PTR(-EINVAL); if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0); if (res) return ERR_PTR(res); } else { sqpn = get_sqp_num(to_mdev(pd->device), 
init_attr); } err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, sqpn, &qp, gfp); if (err) return ERR_PTR(err); qp->port = init_attr->port_num; qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1; break; } default: /* Don't support raw QPs */ return ERR_PTR(-EINVAL); } return &qp->ibqp; } struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { struct ib_device *device = pd ? pd->device : init_attr->xrcd->device; struct ib_qp *ibqp; struct mlx4_ib_dev *dev = to_mdev(device); ibqp = _mlx4_ib_create_qp(pd, init_attr, udata); if (!IS_ERR(ibqp) && (init_attr->qp_type == IB_QPT_GSI) && !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) { struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp))); int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num); if (is_eth && dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI; sqp->roce_v2_gsi = ib_create_qp(pd, init_attr); if (IS_ERR(sqp->roce_v2_gsi)) { pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi)); sqp->roce_v2_gsi = NULL; } else { sqp = to_msqp(to_mqp(sqp->roce_v2_gsi)); sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP; } init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI; } } return ibqp; } static int _mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); struct mlx4_ib_pd *pd; if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); if (dev->qp1_proxy[mqp->port - 1] == mqp) { mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); dev->qp1_proxy[mqp->port - 1] = NULL; mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); } if (mqp->counter_index) mlx4_ib_free_qp_counter(dev, mqp); pd = get_pd(mqp); destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); else kfree(mqp); return 0; } int 
mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_qp *mqp = to_mqp(qp); if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { struct mlx4_ib_sqp *sqp = to_msqp(mqp); if (sqp->roce_v2_gsi) ib_destroy_qp(sqp->roce_v2_gsi); } return _mlx4_ib_destroy_qp(qp); } static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) { switch (type) { case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; case MLX4_IB_QPT_XRC_INI: case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; case MLX4_IB_QPT_PROXY_SMI_OWNER: case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? MLX4_QP_ST_MLX : -1); case MLX4_IB_QPT_PROXY_SMI: case MLX4_IB_QPT_TUN_SMI: case MLX4_IB_QPT_PROXY_GSI: case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? MLX4_QP_ST_UD : -1); default: return -1; } } static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr, int attr_mask) { u8 dest_rd_atomic; u32 access_flags; u32 hw_access_flags = 0; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else dest_rd_atomic = qp->resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else access_flags = qp->atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) hw_access_flags |= MLX4_QP_BIT_RRE; if (access_flags & IB_ACCESS_REMOTE_ATOMIC) hw_access_flags |= MLX4_QP_BIT_RAE; if (access_flags & IB_ACCESS_REMOTE_WRITE) hw_access_flags |= MLX4_QP_BIT_RWE; return cpu_to_be32(hw_access_flags); } static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr, int attr_mask) { if (attr_mask & IB_QP_PKEY_INDEX) sqp->pkey_index = attr->pkey_index; if (attr_mask & IB_QP_QKEY) sqp->qkey = attr->qkey; if (attr_mask & IB_QP_SQ_PSN) sqp->send_psn = attr->sq_psn; } static void 
mlx4_set_sched(struct mlx4_qp_path *path, u8 port) { path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); } static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, struct mlx4_roce_smac_vlan_info *smac_info, u8 port) { int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; int vidx; int smac_index; int err; path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); if (ah->static_rate) { path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET; while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && !(1 << path->static_rate & dev->dev->caps.stat_rate_support)) --path->static_rate; } else path->static_rate = 0; if (ah->ah_flags & IB_AH_GRH) { int real_sgid_index = mlx4_ib_gid_index_to_real_index(dev, port, ah->grh.sgid_index); if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) { pr_err("sgid_index (%u) too large. max is %d\n", real_sgid_index, dev->dev->caps.gid_table_len[port] - 1); return -1; } path->grh_mylmc |= 1 << 7; path->mgid_index = real_sgid_index; path->hop_limit = ah->grh.hop_limit; path->tclass_flowlabel = cpu_to_be32((ah->grh.traffic_class << 20) | (ah->grh.flow_label)); memcpy(path->rgid, ah->grh.dgid.raw, 16); } if (is_eth) { if (!(ah->ah_flags & IB_AH_GRH)) return -1; path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 7) << 3); path->feup |= MLX4_FEUP_FORCE_ETH_UP; if (vlan_tag < 0x1000) { if (smac_info->vid < 0x1000) { /* both valid vlan ids */ if (smac_info->vid != vlan_tag) { /* different VIDs. 
unreg old and reg new */ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); if (err) return err; smac_info->candidate_vid = vlan_tag; smac_info->candidate_vlan_index = vidx; smac_info->candidate_vlan_port = port; smac_info->update_vid = 1; path->vlan_index = vidx; } else { path->vlan_index = smac_info->vlan_index; } } else { /* no current vlan tag in qp */ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); if (err) return err; smac_info->candidate_vid = vlan_tag; smac_info->candidate_vlan_index = vidx; smac_info->candidate_vlan_port = port; smac_info->update_vid = 1; path->vlan_index = vidx; } path->feup |= MLX4_FVL_FORCE_ETH_VLAN; path->fl = 1 << 6; } else { /* have current vlan tag. unregister it at modify-qp success */ if (smac_info->vid < 0x1000) { smac_info->candidate_vid = 0xFFFF; smac_info->update_vid = 1; } } /* get smac_index for RoCE use. * If no smac was yet assigned, register one. * If one was already assigned, but the new mac differs, * unregister the old one and register the new one. 
*/ if ((!smac_info->smac && !smac_info->smac_port) || smac_info->smac != smac) { /* register candidate now, unreg if needed, after success */ smac_index = mlx4_register_mac(dev->dev, port, smac); if (smac_index >= 0) { smac_info->candidate_smac_index = smac_index; smac_info->candidate_smac = smac; smac_info->candidate_smac_port = port; } else { return -EINVAL; } } else { smac_index = smac_info->smac_index; } memcpy(path->dmac, ah->dmac, 6); path->ackto = MLX4_IB_LINK_TYPE_ETH; /* put MAC table smac index for IBoE */ path->grh_mylmc = (u8) (smac_index) | 0x80; } else { path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); } return 0; } static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, enum ib_qp_attr_mask qp_attr_mask, struct mlx4_ib_qp *mqp, struct mlx4_qp_path *path, u8 port, u16 vlan_id, u8 *smac) { return _mlx4_set_path(dev, &qp->ah_attr, mlx4_mac_to_u64(smac), vlan_id, path, &mqp->pri, port); } static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, enum ib_qp_attr_mask qp_attr_mask, struct mlx4_ib_qp *mqp, struct mlx4_qp_path *path, u8 port) { return _mlx4_set_path(dev, &qp->alt_ah_attr, 0, 0xffff, path, &mqp->alt, port); } static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { struct mlx4_ib_gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { ge->added = 1; ge->port = qp->port; } } } static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, struct mlx4_qp_context *context) { u64 u64_mac; int smac_index; u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]); context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); if (!qp->pri.smac && !qp->pri.smac_port) { smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); if (smac_index >= 0) { qp->pri.candidate_smac_index = smac_index; qp->pri.candidate_smac = 
u64_mac; qp->pri.candidate_smac_port = qp->port; context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; } else { return -ENOENT; } } return 0; } static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { struct counter_index *new_counter_index; int err; u32 tmp_idx; if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) != IB_LINK_LAYER_ETHERNET || !(qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) || !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK)) return 0; err = mlx4_counter_alloc(dev->dev, &tmp_idx); if (err) return err; new_counter_index = kmalloc(sizeof(*new_counter_index), GFP_KERNEL); if (!new_counter_index) { mlx4_counter_free(dev->dev, tmp_idx); return -ENOMEM; } new_counter_index->index = tmp_idx; new_counter_index->allocated = 1; qp->counter_index = new_counter_index; mutex_lock(&dev->counters_table[qp->port - 1].mutex); list_add_tail(&new_counter_index->list, &dev->counters_table[qp->port - 1].counters_list); mutex_unlock(&dev->counters_table[qp->port - 1].mutex); return 0; } enum { MLX4_QPC_ROCE_MODE_1 = 0, MLX4_QPC_ROCE_MODE_2 = 2, MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff }; static u8 gid_type_to_qpc(enum ib_gid_type gid_type) { switch (gid_type) { case IB_GID_TYPE_ROCE: return MLX4_QPC_ROCE_MODE_1; case IB_GID_TYPE_ROCE_UDP_ENCAP: return MLX4_QPC_ROCE_MODE_2; default: return MLX4_QPC_ROCE_MODE_UNDEFINED; } } static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); struct mlx4_ib_pd *pd; struct mlx4_ib_cq *send_cq, *recv_cq; struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; int steer_qp = 0; int err = -EINVAL; int counter_index; /* APM is not supported under RoCE */ if (attr_mask & IB_QP_ALT_PATH && rdma_port_get_link_layer(&dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET) return -ENOTSUPP; context = kzalloc(sizeof 
*context, GFP_KERNEL); if (!context) return -ENOMEM; context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); else { optpar |= MLX4_QP_OPTPAR_PM_STATE; switch (attr->path_mig_state) { case IB_MIG_MIGRATED: context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); break; case IB_MIG_REARM: context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11); break; case IB_MIG_ARMED: context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11); break; } } if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; else if (ibqp->qp_type == IB_QPT_RAW_PACKET) context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; else if (ibqp->qp_type == IB_QPT_UD) { if (qp->flags & MLX4_IB_QP_LSO) context->mtu_msgmax = (IB_MTU_4096 << 5) | ilog2(dev->dev->caps.max_gso_sz); else context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { pr_err("path MTU (%u) is invalid\n", attr->path_mtu); goto out; } context->mtu_msgmax = (attr->path_mtu << 5) | ilog2(dev->dev->caps.max_msg_sz); } if (qp->rq.wqe_cnt) context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; context->rq_size_stride |= qp->rq.wqe_shift - 4; if (qp->sq.wqe_cnt) context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; context->sq_size_stride |= qp->sq.wqe_shift - 4; if (new_state == IB_QPS_RESET && qp->counter_index) mlx4_ib_free_qp_counter(dev, qp); if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; context->xrcd = cpu_to_be32((u32) qp->xrcdn); if (ibqp->qp_type == IB_QPT_RAW_PACKET) context->param3 |= cpu_to_be32(1 << 30); } if (qp->ibqp.uobject) context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, to_mucontext(ibqp->uobject->context)->uar.index)); else 
context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index)); if (attr_mask & IB_QP_DEST_QPN) context->remote_qpn = cpu_to_be32(attr->dest_qp_num); if (attr_mask & IB_QP_PORT) { if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD && !(attr_mask & IB_QP_AV)) { mlx4_set_sched(&context->pri_path, attr->port_num); optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE; } } if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { err = create_qp_lb_counter(dev, qp); if (err) goto out; counter_index = dev->counters_table[qp->port - 1].default_counter; if (qp->counter_index) counter_index = qp->counter_index->index; if (counter_index != -1) { context->pri_path.counter_index = counter_index; optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; if (qp->counter_index) { context->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB; context->pri_path.vlan_control |= MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; } } else context->pri_path.counter_index = MLX4_SINK_COUNTER_INDEX(dev->dev); if (qp->flags & MLX4_IB_QP_NETIF) { mlx4_ib_steer_qp_reg(dev, qp, 1); steer_qp = 1; } if (ibqp->qp_type == IB_QPT_GSI) { enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; u8 qpc_roce_mode = gid_type_to_qpc(gid_type); context->rlkey_roce_mode |= (qpc_roce_mode << 6); } } if (attr_mask & IB_QP_PKEY_INDEX) { if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) context->pri_path.disable_pkey_check = 0x40; context->pri_path.pkey_index = attr->pkey_index; optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; } if (attr_mask & IB_QP_AV) { u8 port_num = mlx4_is_bonded(to_mdev(ibqp->device)->dev) ? 1 : attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; union ib_gid gid; struct ib_gid_attr gid_attr; u16 vlan = 0xffff; u8 smac[ETH_ALEN]; int status = 0; int is_eth = rdma_cap_eth_ah(&dev->ib_dev, port_num) && attr->ah_attr.ah_flags & IB_AH_GRH; if (is_eth) { int index = attr->ah_attr.grh.sgid_index; status = ib_get_cached_gid(ibqp->device, port_num, index, &gid, &gid_attr); if (!status && !memcmp(&gid, &zgid, sizeof(gid))) status = -ENOENT; if (!status && gid_attr.ndev) { vlan = rdma_vlan_dev_vlan_id(gid_attr.ndev); memcpy(smac, IF_LLADDR(gid_attr.ndev), ETH_ALEN); if_rele(gid_attr.ndev); } } if (status) goto out; if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, port_num, vlan, smac)) goto out; optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE); if (is_eth && (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) { u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type); if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) { err = -EINVAL; goto out; } context->rlkey_roce_mode |= (qpc_roce_mode << 6); } } if (attr_mask & IB_QP_TIMEOUT) { context->pri_path.ackto |= attr->timeout << 3; optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; } if (attr_mask & IB_QP_ALT_PATH) { if (attr->alt_port_num == 0 || attr->alt_port_num > dev->dev->caps.num_ports) goto out; if (attr->alt_pkey_index >= dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; if (mlx4_set_alt_path(dev, attr, attr_mask, qp, &context->alt_path, attr->alt_port_num)) goto out; context->alt_path.pkey_index = attr->alt_pkey_index; context->alt_path.ackto = attr->alt_timeout << 3; optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } pd = get_pd(qp); get_cqs(qp, &send_cq, &recv_cq); context->pd = cpu_to_be32(pd->pdn); context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); /* Set "fast registration enabled" for all kernel QPs */ if (!qp->ibqp.uobject) context->params1 |= cpu_to_be32(1 << 11); if (attr_mask & 
IB_QP_RNR_RETRY) { context->params1 |= cpu_to_be32(attr->rnr_retry << 13); optpar |= MLX4_QP_OPTPAR_RNR_RETRY; } if (attr_mask & IB_QP_RETRY_CNT) { context->params1 |= cpu_to_be32(attr->retry_cnt << 16); optpar |= MLX4_QP_OPTPAR_RETRY_COUNT; } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic) context->params1 |= cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); optpar |= MLX4_QP_OPTPAR_SRA_MAX; } if (attr_mask & IB_QP_SQ_PSN) context->next_send_psn = cpu_to_be32(attr->sq_psn); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) context->params2 |= cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); optpar |= MLX4_QP_OPTPAR_RRA_MAX; } if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask); optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; } if (ibqp->srq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); if (attr_mask & IB_QP_MIN_RNR_TIMER) { context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT; } if (attr_mask & IB_QP_RQ_PSN) context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ if (attr_mask & IB_QP_QKEY) { if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) context->qkey = cpu_to_be32(IB_QP_SET_QKEY); else { if (mlx4_is_mfunc(dev->dev) && !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && (attr->qkey & MLX4_RESERVED_QKEY_MASK) == MLX4_RESERVED_QKEY_BASE) { pr_err("Cannot use reserved QKEY" " 0x%x (range 0xffff0000..0xffffffff" " is reserved)\n", attr->qkey); err = -EINVAL; goto out; } context->qkey = cpu_to_be32(attr->qkey); } optpar |= MLX4_QP_OPTPAR_Q_KEY; } if (ibqp->srq) context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if 
(cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_PACKET)) { context->pri_path.sched_queue = (qp->port - 1) << 6; if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) context->pri_path.fl = 0x80; } else { if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; } if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET) { if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) context->pri_path.feup = 1 << 7; /* don't fsm */ /* handle smac_index */ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { err = handle_eth_ud_smac_index(dev, qp, context); if (err) { err = -EINVAL; goto out; } if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) dev->qp1_proxy[qp->port - 1] = qp; } } } if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | MLX4_IB_LINK_TYPE_ETH; if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { /* set QP to receive both tunneled & non-tunneled packets */ if (!(context->flags & cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET))) context->srqn = cpu_to_be32(7 << 28); } } if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { int is_eth = rdma_port_get_link_layer( &dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET; if (is_eth) { context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; } } if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event 
= 1; else sqd_event = 0; if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->rlkey_roce_mode |= (1 << 4); /* * Before passing a kernel QP to the HW, make sure that the * ownership bits of the send queue are set and the SQ * headroom is stamped so that the hardware doesn't start * processing stale work requests. */ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { struct mlx4_wqe_ctrl_seg *ctrl; int i; for (i = 0; i < qp->sq.wqe_cnt; ++i) { ctrl = get_send_wqe(qp, i); ctrl->owner_opcode = cpu_to_be32(1U << 31); if (qp->sq_max_wqes_per_wr == 1) ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); } } err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), to_mlx4_state(new_state), context, optpar, sqd_event, &qp->mqp); if (err) goto out; qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) qp->atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) { qp->port = attr->port_num; update_mcg_macs(dev, qp); } if (attr_mask & IB_QP_ALT_PATH) qp->alt_port = attr->alt_port_num; if (is_sqp(dev, qp)) store_sqp_attrs(to_msqp(qp), attr, attr_mask); /* * If we moved QP0 to RTR, bring the IB link up; if we moved * QP0 to RESET or ERROR, bring the link back down. */ if (is_qp0(dev, qp)) { if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) if (mlx4_INIT_PORT(dev->dev, qp->port)) pr_warn("INIT_PORT failed for port %d\n", qp->port); if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) mlx4_CLOSE_PORT(dev->dev, qp->port); } /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET) { if (!ibqp->uobject) { mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; qp->sq.head = 0; qp->sq.tail = 0; qp->sq_next_wqe = 0; if (qp->rq.wqe_cnt) *qp->db.db = 0; if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_reg(dev, qp, 0); } if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = 0; qp->pri.smac_port = 0; } if (qp->alt.smac) { mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); qp->alt.smac = 0; } if (qp->pri.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); qp->pri.vid = 0xFFFF; qp->pri.candidate_vid = 0xFFFF; qp->pri.update_vid = 0; } if (qp->alt.vid < 0x1000) { mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); qp->alt.vid = 0xFFFF; qp->alt.candidate_vid = 0xFFFF; qp->alt.update_vid = 0; } } out: if (err && qp->counter_index) mlx4_ib_free_qp_counter(dev, qp); if (err && steer_qp) mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); if (qp->pri.candidate_smac || (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) { if (err) { mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); } else { if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = qp->pri.candidate_smac; qp->pri.smac_index = qp->pri.candidate_smac_index; qp->pri.smac_port = qp->pri.candidate_smac_port; } qp->pri.candidate_smac = 0; qp->pri.candidate_smac_index = 0; qp->pri.candidate_smac_port = 0; } if (qp->alt.candidate_smac) { if (err) { mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); } else { if (qp->alt.smac) mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); qp->alt.smac = qp->alt.candidate_smac; qp->alt.smac_index = qp->alt.candidate_smac_index; qp->alt.smac_port = qp->alt.candidate_smac_port; } qp->alt.candidate_smac = 0; 
qp->alt.candidate_smac_index = 0; qp->alt.candidate_smac_port = 0; } if (qp->pri.update_vid) { if (err) { if (qp->pri.candidate_vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, qp->pri.candidate_vid); } else { if (qp->pri.vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); qp->pri.vid = qp->pri.candidate_vid; qp->pri.vlan_port = qp->pri.candidate_vlan_port; qp->pri.vlan_index = qp->pri.candidate_vlan_index; } qp->pri.candidate_vid = 0xFFFF; qp->pri.update_vid = 0; } if (qp->alt.update_vid) { if (err) { if (qp->alt.candidate_vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, qp->alt.candidate_vid); } else { if (qp->alt.vid < 0x1000) mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); qp->alt.vid = qp->alt.candidate_vid; qp->alt.vlan_port = qp->alt.candidate_vlan_port; qp->alt.vlan_index = qp->alt.candidate_vlan_index; } qp->alt.candidate_vid = 0xFFFF; qp->alt.update_vid = 0; } return err; } static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. 
qp_type %d," " attr_mask 0x%x\n", ibqp->qp_num, cur_state, new_state, ibqp->qp_type, attr_mask); goto out; } if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { if ((ibqp->qp_type == IB_QPT_RC) || (ibqp->qp_type == IB_QPT_UD) || (ibqp->qp_type == IB_QPT_UC) || (ibqp->qp_type == IB_QPT_RAW_PACKET) || (ibqp->qp_type == IB_QPT_XRC_INI)) { attr->port_num = mlx4_ib_bond_next_port(dev); } } else { /* no sense in changing port_num * when ports are bonded */ attr_mask &= ~IB_QP_PORT; } } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->num_ports)) { pr_debug("qpn 0x%x: invalid port number (%d) specified " "for transition %d to %d. qp_type %d\n", ibqp->qp_num, attr->port_num, cur_state, new_state, ibqp->qp_type); goto out; } if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) && (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) != IB_LINK_LAYER_ETHERNET)) goto out; if (attr_mask & IB_QP_PKEY_INDEX) { int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { pr_debug("qpn 0x%x: invalid pkey index (%d) specified " "for transition %d to %d. qp_type %d\n", ibqp->qp_num, attr->pkey_index, cur_state, new_state, ibqp->qp_type); goto out; } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " "Transition %d to %d. qp_type %d\n", ibqp->qp_num, attr->max_rd_atomic, cur_state, new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " "Transition %d to %d. 
qp_type %d\n", ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, new_state, ibqp->qp_type); goto out; } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; goto out; } err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) attr->port_num = 1; out: mutex_unlock(&qp->mutex); return err; } int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); int ret; ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata); if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { struct mlx4_ib_sqp *sqp = to_msqp(mqp); int err = 0; if (sqp->roce_v2_gsi) err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask); if (err) pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n", err); } return ret; } static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) { int i; for (i = 0; i < dev->caps.num_ports; i++) { if (qpn == dev->caps.qp0_proxy[i] || qpn == dev->caps.qp0_tunnel[i]) { *qkey = dev->caps.qp0_qkey[i]; return 0; } } return -EINVAL; } static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, - struct ib_ud_wr *wr, + const struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); struct ib_device *ib_dev = &mdev->ib_dev; struct mlx4_wqe_mlx_seg *mlx = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->ah); u16 pkey; u32 qkey; int send_size; int header_size; int spc; int i; if (wr->wr.opcode != IB_WR_SEND) return -EINVAL; send_size = 0; for (i = 0; i < wr->wr.num_sge; ++i) send_size += wr->wr.sg_list[i].length; /* for proxy-qp0 sends, need to add in size of tunnel header */ /* for tunnel-qp0 sends, tunnel header is already in s/g list */ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) send_size += sizeof (struct mlx4_ib_tunnel_header); ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header); 
if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; sqp->ud_header.lrh.destination_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); } mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); /* force loopback */ mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); mlx->rlid = sqp->ud_header.lrh.destination_lid; sqp->ud_header.lrh.virtual_lane = 0; sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); else sqp->ud_header.bth.destination_qpn = cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); if (mlx4_is_master(mdev->dev)) { if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) return -EINVAL; } else { if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) return -EINVAL; } sqp->ud_header.deth.qkey = cpu_to_be32(qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; sqp->ud_header.immediate_present = 0; header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); /* * Inline data segments may not cross a 64 byte boundary. If * our UD header is bigger than the space available up to the * next 64 byte boundary in the WQE, use two inline data * segments to hold the UD header. 
*/ spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (header_size <= spc) { inl->byte_count = cpu_to_be32((1U << 31) | header_size); memcpy(inl + 1, sqp->header_buf, header_size); i = 1; } else { inl->byte_count = cpu_to_be32((1U << 31) | spc); memcpy(inl + 1, sqp->header_buf, spc); inl = (void *) (inl + 1) + spc; memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); /* * Need a barrier here to make sure all the data is * visible before the byte_count field is set. * Otherwise the HCA prefetcher could grab the 64-byte * chunk with this inline segment and get a valid (!= * 0xffffffff) byte count but stale data, and end up * generating a packet with bad headers. * * The first inline segment's byte_count field doesn't * need a barrier, because it comes after a * control/MLX segment and therefore is at an offset * of 16 mod 64. */ wmb(); inl->byte_count = cpu_to_be32((1U << 31) | (header_size - spc)); i = 2; } *mlx_seg_len = ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); return 0; } static u8 sl_to_vl(struct mlx4_ib_dev *dev, u8 sl, int port_num) { union sl2vl_tbl_to_u64 tmp_vltab; u8 vl; if (sl > 15) return 0xf; tmp_vltab.sl64 = atomic64_read(&dev->sl2vl[port_num - 1]); vl = tmp_vltab.sl8[sl >> 1]; if (sl & 1) vl &= 0x0f; else vl >>= 4; return vl; } #define MLX4_ROCEV2_QP1_SPORT 0xC000 -static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, +static int build_mlx_header(struct mlx4_ib_sqp *sqp, const struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct ib_device *ib_dev = sqp->qp.ibqp.device; struct mlx4_wqe_mlx_seg *mlx = wqe; struct mlx4_wqe_ctrl_seg *ctrl = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->ah); union ib_gid sgid; u16 pkey; int send_size; int header_size; int spc; int i; int err = 0; u16 vlan = 0xffff; bool is_eth; bool is_vlan = false; bool is_grh; bool is_udp = false; int ip_version = 0; send_size = 0; for (i = 0; i < 
wr->wr.num_sge; ++i) send_size += wr->wr.sg_list[i].length; is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); if (is_eth) { struct ib_gid_attr gid_attr; if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so * we must use our own cache */ err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sgid.raw[0]); if (err) return err; } else { err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sgid, &gid_attr); if (!err) { if (gid_attr.ndev) if_rele(gid_attr.ndev); if (!memcmp(&sgid, &zgid, sizeof(sgid))) err = -ENOENT; } if (!err) { is_udp = gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; if (is_udp) { if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) ip_version = 4; else ip_version = 6; is_grh = false; } } else { return err; } } if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; is_vlan = 1; } } err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, ip_version, is_udp, 0, &sqp->ud_header); if (err) return err; if (!is_eth) { sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); } if (is_grh || (ip_version == 6)) { sqp->ud_header.grh.traffic_class = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; if (is_eth) { memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); } else { if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so * we must use our own cache */ 
sqp->ud_header.grh.source_gid.global.subnet_prefix = cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov. demux[sqp->qp.port - 1]. subnet_prefix))); sqp->ud_header.grh.source_gid.global.interface_id = to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. guid_cache[ah->av.ib.gid_index]; } else { ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid, NULL); } } memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16); } if (ip_version == 4) { sqp->ud_header.ip4.tos = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; sqp->ud_header.ip4.id = 0; sqp->ud_header.ip4.frag_off = htons(IP_DF); sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit; memcpy(&sqp->ud_header.ip4.saddr, sgid.raw + 12, 4); memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4); sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header); } if (is_udp) { sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT); sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT); sqp->ud_header.udp.csum = 0; } mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); if (!is_eth) { mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | (sqp->ud_header.lrh.service_level << 8)); if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) mlx->flags |= cpu_to_be32(0x1); /* force loopback */ mlx->rlid = sqp->ud_header.lrh.destination_lid; } switch (wr->wr.opcode) { case IB_WR_SEND: sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; sqp->ud_header.immediate_present = 0; break; case IB_WR_SEND_WITH_IMM: sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; sqp->ud_header.immediate_present = 1; sqp->ud_header.immediate_data = wr->wr.ex.imm_data; break; default: return -EINVAL; } if (is_eth) { struct in6_addr in6; u16 ether_type; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; ether_type = (!is_udp) ? 
MLX4_IB_IBOE_ETHERTYPE : (ip_version == 4 ? ETHERTYPE_IP : ETHERTYPE_IPV6); mlx->sched_prio = cpu_to_be16(pcp); ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); memcpy(&in6, sgid.raw, sizeof(in6)); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); if (!is_vlan) { sqp->ud_header.eth.type = cpu_to_be16(ether_type); } else { sqp->ud_header.vlan.type = cpu_to_be16(ether_type); sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); } } else { sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : sl_to_vl(to_mdev(ib_dev), sqp->ud_header.lrh.service_level, sqp->qp.port); if (sqp->qp.ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15) return -EINVAL; if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; } sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); if (!sqp->qp.ibqp.qp_num) ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); else ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->pkey_index, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ? sqp->qkey : wr->remote_qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); if (0) { pr_err("built UD header of size %d:\n", header_size); for (i = 0; i < header_size / 4; ++i) { if (i % 8 == 0) pr_err(" [%02x] ", i * 4); pr_cont(" %08x", be32_to_cpu(((__be32 *) sqp->header_buf)[i])); if ((i + 1) % 8 == 0) pr_cont("\n"); } pr_err("\n"); } /* * Inline data segments may not cross a 64 byte boundary. 
If * our UD header is bigger than the space available up to the * next 64 byte boundary in the WQE, use two inline data * segments to hold the UD header. */ spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (header_size <= spc) { inl->byte_count = cpu_to_be32(1U << 31 | header_size); memcpy(inl + 1, sqp->header_buf, header_size); i = 1; } else { inl->byte_count = cpu_to_be32(1U << 31 | spc); memcpy(inl + 1, sqp->header_buf, spc); inl = (void *) (inl + 1) + spc; memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); /* * Need a barrier here to make sure all the data is * visible before the byte_count field is set. * Otherwise the HCA prefetcher could grab the 64-byte * chunk with this inline segment and get a valid (!= * 0xffffffff) byte count but stale data, and end up * generating a packet with bad headers. * * The first inline segment's byte_count field doesn't * need a barrier, because it comes after a * control/MLX segment and therefore is at an offset * of 16 mod 64. */ wmb(); inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc)); i = 2; } *mlx_seg_len = ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); return 0; } static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) { unsigned cur; struct mlx4_ib_cq *cq; cur = wq->head - wq->tail; if (likely(cur + nreq < wq->max_post)) return 0; cq = to_mcq(ib_cq); spin_lock(&cq->lock); cur = wq->head - wq->tail; spin_unlock(&cq->lock); return cur + nreq >= wq->max_post; } static __be32 convert_access(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? 
cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); } static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg, - struct ib_reg_wr *wr) + const struct ib_reg_wr *wr) { struct mlx4_ib_mr *mr = to_mmr(wr->mr); fseg->flags = convert_access(wr->access); fseg->mem_key = cpu_to_be32(wr->key); fseg->buf_list = cpu_to_be64(mr->page_map); fseg->start_addr = cpu_to_be64(mr->ibmr.iova); fseg->reg_len = cpu_to_be64(mr->ibmr.length); fseg->offset = 0; /* XXX -- is this just for ZBVA? */ fseg->page_size = cpu_to_be32(ilog2(mr->ibmr.page_size)); fseg->reserved[0] = 0; fseg->reserved[1] = 0; } static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) { memset(iseg, 0, sizeof(*iseg)); iseg->mem_key = cpu_to_be32(rkey); } static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, u64 remote_addr, u32 rkey) { rseg->raddr = cpu_to_be64(remote_addr); rseg->rkey = cpu_to_be32(rkey); rseg->reserved = 0; } static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, - struct ib_atomic_wr *wr) + const struct ib_atomic_wr *wr) { if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { aseg->swap_add = cpu_to_be64(wr->swap); aseg->compare = cpu_to_be64(wr->compare_add); } else if (wr->wr.opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { aseg->swap_add = cpu_to_be64(wr->compare_add); aseg->compare = cpu_to_be64(wr->compare_add_mask); } else { aseg->swap_add = cpu_to_be64(wr->compare_add); aseg->compare = 0; } } static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, - struct ib_atomic_wr *wr) + const struct ib_atomic_wr *wr) { aseg->swap_add = cpu_to_be64(wr->swap); aseg->swap_add_mask = cpu_to_be64(wr->swap_mask); aseg->compare = cpu_to_be64(wr->compare_add); aseg->compare_mask = cpu_to_be64(wr->compare_add_mask); } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, - struct ib_ud_wr *wr) + const struct ib_ud_wr *wr) { memcpy(dseg->av, &to_mah(wr->ah)->av, sizeof (struct mlx4_av)); dseg->dqpn = 
/*
 * Build the datagram segment for a send on a proxy special QP.
 *
 * Instead of the caller's address vector, the segment gets a reduced copy:
 * the loopback bit is forced on, only the low 7 bits of g_slid are kept,
 * and sl_tclass_flowlabel is masked down to its top 4 bits.  The
 * destination QPN is the port's QP1 tunnel QP when @qpt is
 * MLX4_IB_QPT_PROXY_GSI and the port's QP0 tunnel QP otherwise.
 */
static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
				    struct mlx4_wqe_datagram_seg *dseg,
				    const struct ib_ud_wr *wr,
				    enum mlx4_ib_qp_type qpt)
{
	union mlx4_ext_av *av = &to_mah(wr->ah)->av;
	struct mlx4_av sqp_av = {0};
	/* Port number comes from the low 2 bits of the first byte of port_pd. */
	int port = *((u8 *) &av->ib.port_pd) & 0x3;

	/* force loopback */
	sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
	sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
	sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
			cpu_to_be32(0xf0000000);

	memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
	/* The capability tables are indexed by zero-based port. */
	if (qpt == MLX4_IB_QPT_PROXY_GSI)
		dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
	else
		dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]);
	/* Use QKEY from the QP context, which is set by master */
	dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
}

/*
 * Write a tunnel header into the WQE at @wqe as inline data and report the
 * space it took, rounded up to 16 bytes, through @mlx_seg_len.
 *
 * The header carries the address vector, remote QPN, pkey index, qkey, MAC
 * and VLAN taken from @wr and its address handle.  If the header does not
 * fit in the space left before the next MLX4_INLINE_ALIGN boundary it is
 * split across two inline segments.
 */
static void build_tunnel_header(const struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len)
{
	struct mlx4_wqe_inline_seg *inl = wqe;
	struct mlx4_ib_tunnel_header hdr;
	struct mlx4_ib_ah *ah = to_mah(wr->ah);
	int spc;
	int i;	/* number of inline segments used (1 or 2) */

	memcpy(&hdr.av, &ah->av, sizeof hdr.av);
	hdr.remote_qpn = cpu_to_be32(wr->remote_qpn);
	hdr.pkey_index = cpu_to_be16(wr->pkey_index);
	hdr.qkey = cpu_to_be32(wr->remote_qkey);
	memcpy(hdr.mac, ah->av.eth.mac, 6);
	hdr.vlan = ah->av.eth.vlan;

	/* Bytes available after the segment header up to the next boundary. */
	spc = MLX4_INLINE_ALIGN -
		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
	if (sizeof (hdr) <= spc) {
		/* Payload first, barrier, then the byte count (bit 31 set). */
		memcpy(inl + 1, &hdr, sizeof (hdr));
		wmb();
		inl->byte_count = cpu_to_be32((1U << 31) | (u32)sizeof(hdr));
		i = 1;
	} else {
		/* First segment: fill up to the boundary. */
		memcpy(inl + 1, &hdr, spc);
		wmb();
		inl->byte_count = cpu_to_be32((1U << 31) | spc);

		/* Second segment: starts right after the first one's data. */
		inl = (void *) (inl + 1) + spc;
		memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
		wmb();
		inl->byte_count = cpu_to_be32((1U << 31) | (u32)(sizeof (hdr) - spc));
		i = 2;
	}

	*mlx_seg_len =
		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
}

/*
 * Write a 4-byte inline segment at @dseg: the data word (second 32-bit
 * word of the segment) is zeroed first and the byte count is published
 * only after a write barrier.  The caller in this file uses it as the
 * extra ICRC segment appended to MLX sends.
 */
static void set_mlx_icrc_seg(void *dseg)
{
	u32 *t = dseg;
	struct mlx4_wqe_inline_seg *iseg = dseg;

	t[1] = 0;

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	iseg->byte_count = cpu_to_be32((1U << 31) | 4);
}
*/ wmb(); dseg->byte_count = cpu_to_be32(sg->length); } static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) { dseg->byte_count = cpu_to_be32(sg->length); dseg->lkey = cpu_to_be32(sg->lkey); dseg->addr = cpu_to_be64(sg->addr); } -static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_ud_wr *wr, +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, const struct ib_ud_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->hlen, 16); if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) *blh = cpu_to_be32(1 << 6); if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && wr->wr.num_sge > qp->sq.max_gs - (halign >> 4))) return -EINVAL; memcpy(wqe->header, wr->header, wr->hlen); *lso_hdr_sz = cpu_to_be32(wr->mss << 16 | wr->hlen); *lso_seg_len = halign; return 0; } -static __be32 send_ieth(struct ib_send_wr *wr) +static __be32 send_ieth(const struct ib_send_wr *wr) { switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: case IB_WR_RDMA_WRITE_WITH_IMM: return wr->ex.imm_data; case IB_WR_SEND_WITH_INV: return cpu_to_be32(wr->ex.invalidate_rkey); default: return 0; } } static void add_zero_len_inline(void *wqe) { struct mlx4_wqe_inline_seg *inl = wqe; memset(wqe, 0, 16); inl->byte_count = cpu_to_be32(1U << 31); } -int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct mlx4_ib_qp *qp = to_mqp(ibqp); void *wqe; struct mlx4_wqe_ctrl_seg *ctrl; struct mlx4_wqe_data_seg *dseg; unsigned long flags; int nreq; int err = 0; unsigned ind; int uninitialized_var(stamp); int uninitialized_var(size); unsigned uninitialized_var(seglen); __be32 dummy; __be32 *lso_wqe; __be32 lso_hdr_sz = 0; __be32 blh; int i; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { struct mlx4_ib_sqp *sqp = to_msqp(qp); if 
(sqp->roce_v2_gsi) { struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah); struct ib_gid_attr gid_attr; union ib_gid gid; if (!ib_get_cached_gid(ibqp->device, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &gid, &gid_attr)) { if (gid_attr.ndev) if_rele(gid_attr.ndev); qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? to_mqp(sqp->roce_v2_gsi) : qp; } else { pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n", ah->av.ib.gid_index); } } } spin_lock_irqsave(&qp->sq.lock, flags); if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; nreq = 0; goto out; } ind = qp->sq_next_wqe; for (nreq = 0; wr; ++nreq, wr = wr->next) { lso_wqe = &dummy; blh = 0; if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->sq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; ctrl->srcrb_flags = (wr->send_flags & IB_SEND_SIGNALED ? cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | (wr->send_flags & IB_SEND_SOLICITED ? cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | ((wr->send_flags & IB_SEND_IP_CSUM) ? 
cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) | qp->sq_signal_bits; ctrl->imm = send_ieth(wr); wqe += sizeof *ctrl; size = sizeof *ctrl / 16; switch (qp->mlx4_ib_qp_type) { case MLX4_IB_QPT_RC: case MLX4_IB_QPT_UC: switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, atomic_wr(wr)->rkey); wqe += sizeof (struct mlx4_wqe_raddr_seg); set_atomic_seg(wqe, atomic_wr(wr)); wqe += sizeof (struct mlx4_wqe_atomic_seg); size += (sizeof (struct mlx4_wqe_raddr_seg) + sizeof (struct mlx4_wqe_atomic_seg)) / 16; break; case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, atomic_wr(wr)->rkey); wqe += sizeof (struct mlx4_wqe_raddr_seg); set_masked_atomic_seg(wqe, atomic_wr(wr)); wqe += sizeof (struct mlx4_wqe_masked_atomic_seg); size += (sizeof (struct mlx4_wqe_raddr_seg) + sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16; break; case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); wqe += sizeof (struct mlx4_wqe_raddr_seg); size += sizeof (struct mlx4_wqe_raddr_seg) / 16; break; case IB_WR_LOCAL_INV: ctrl->srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); set_local_inv_seg(wqe, wr->ex.invalidate_rkey); wqe += sizeof (struct mlx4_wqe_local_inval_seg); size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; break; case IB_WR_REG_MR: ctrl->srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); set_reg_seg(wqe, reg_wr(wr)); wqe += sizeof(struct mlx4_wqe_fmr_seg); size += sizeof(struct mlx4_wqe_fmr_seg) / 16; break; default: /* No extra segments required for sends */ break; } break; case MLX4_IB_QPT_TUN_SMI_OWNER: err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr), ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; } wqe += seglen; size += seglen / 16; break; case MLX4_IB_QPT_TUN_SMI: case MLX4_IB_QPT_TUN_GSI: 
/* this is a UD qp used in MAD responses to slaves. */ set_datagram_seg(wqe, ud_wr(wr)); /* set the forced-loopback bit in the data seg av */ *(__be32 *) wqe |= cpu_to_be32(0x80000000); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; break; case MLX4_IB_QPT_UD: set_datagram_seg(wqe, ud_wr(wr)); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr->opcode == IB_WR_LSO) { err = build_lso_seg(wqe, ud_wr(wr), qp, &seglen, &lso_hdr_sz, &blh); if (unlikely(err)) { *bad_wr = wr; goto out; } lso_wqe = (__be32 *) wqe; wqe += seglen; size += seglen / 16; } break; case MLX4_IB_QPT_PROXY_SMI_OWNER: err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr), ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; } wqe += seglen; size += seglen / 16; /* to start tunnel header on a cache-line boundary */ add_zero_len_inline(wqe); wqe += 16; size++; build_tunnel_header(ud_wr(wr), wqe, &seglen); wqe += seglen; size += seglen / 16; break; case MLX4_IB_QPT_PROXY_SMI: case MLX4_IB_QPT_PROXY_GSI: /* If we are tunneling special qps, this is a UD qp. * In this case we first add a UD segment targeting * the tunnel qp, and then add a header with address * information */ set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, ud_wr(wr), qp->mlx4_ib_qp_type); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; build_tunnel_header(ud_wr(wr), wqe, &seglen); wqe += seglen; size += seglen / 16; break; case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: err = build_mlx_header(to_msqp(qp), ud_wr(wr), ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; } wqe += seglen; size += seglen / 16; break; default: break; } /* * Write data segments in reverse order, so as to * overwrite cacheline stamp last within each * cacheline. This avoids issues with WQE * prefetching. 
*/ dseg = wqe; dseg += wr->num_sge - 1; size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); /* Add one more inline data segment for ICRC for MLX sends */ if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { set_mlx_icrc_seg(dseg + 1); size += sizeof (struct mlx4_wqe_data_seg) / 16; } for (i = wr->num_sge - 1; i >= 0; --i, --dseg) set_data_seg(dseg, wr->sg_list + i); /* * Possibly overwrite stamping in cacheline with LSO * segment only after making sure all data segments * are written. */ wmb(); *lso_wqe = lso_hdr_sz; ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size; /* * Make sure descriptor is fully written before * setting ownership bit (because HW can start * executing as soon as we do). */ wmb(); if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { *bad_wr = wr; err = -EINVAL; goto out; } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | (ind & qp->sq.wqe_cnt ? cpu_to_be32(1U << 31) : 0) | blh; stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); /* * We can improve latency by not stamping the last * send queue WQE until after ringing the doorbell, so * only stamp here if there are still more WQEs to post. * * Same optimization applies to padding with NOP wqe * in case of WQE shrinking (used to prevent wrap-around * in the middle of WR). */ if (wr->next) { stamp_send_wqe(qp, stamp, size * 16); ind = pad_wraparound(qp, ind); } } out: if (likely(nreq)) { qp->sq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); writel(qp->doorbell_qpn, to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); /* * Make sure doorbells don't leak out of SQ spinlock * and reach the HCA out of order. 
*/ mmiowb(); stamp_send_wqe(qp, stamp, size * 16); ind = pad_wraparound(qp, ind); qp->sq_next_wqe = ind; } spin_unlock_irqrestore(&qp->sq.lock, flags); return err; } -int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mlx4_ib_qp *qp = to_mqp(ibqp); struct mlx4_wqe_data_seg *scat; unsigned long flags; int err = 0; int nreq; int ind; int max_gs; int i; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; nreq = 0; goto out; } ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->rq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } scat = get_recv_wqe(qp, ind); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { ib_dma_sync_single_for_device(ibqp->device, qp->sqp_proxy_rcv[ind].map, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); scat->byte_count = cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); /* use dma lkey from upper layer entry */ scat->lkey = cpu_to_be32(wr->sg_list->lkey); scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); scat++; max_gs--; } for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); if (i < max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); scat[i].addr = 0; } qp->rq.wrid[ind] = wr->wr_id; ind = (ind + 1) & (qp->rq.wqe_cnt - 1); } out: if (likely(nreq)) { qp->rq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. 
*/ wmb(); *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); } spin_unlock_irqrestore(&qp->rq.lock, flags); return err; } static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) { switch (mlx4_state) { case MLX4_QP_STATE_RST: return IB_QPS_RESET; case MLX4_QP_STATE_INIT: return IB_QPS_INIT; case MLX4_QP_STATE_RTR: return IB_QPS_RTR; case MLX4_QP_STATE_RTS: return IB_QPS_RTS; case MLX4_QP_STATE_SQ_DRAINING: case MLX4_QP_STATE_SQD: return IB_QPS_SQD; case MLX4_QP_STATE_SQER: return IB_QPS_SQE; case MLX4_QP_STATE_ERR: return IB_QPS_ERR; default: return -1; } } static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) { switch (mlx4_mig_state) { case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; case MLX4_QP_PM_REARM: return IB_MIG_REARM; case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; default: return -1; } } static int to_ib_qp_access_flags(int mlx4_flags) { int ib_flags = 0; if (mlx4_flags & MLX4_QP_BIT_RRE) ib_flags |= IB_ACCESS_REMOTE_READ; if (mlx4_flags & MLX4_QP_BIT_RWE) ib_flags |= IB_ACCESS_REMOTE_WRITE; if (mlx4_flags & MLX4_QP_BIT_RAE) ib_flags |= IB_ACCESS_REMOTE_ATOMIC; return ib_flags; } static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, struct mlx4_qp_path *path) { struct mlx4_dev *dev = ibdev->dev; int is_eth; memset(ib_ah_attr, 0, sizeof *ib_ah_attr); ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1; if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) return; is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) == IB_LINK_LAYER_ETHERNET; if (is_eth) ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | ((path->sched_queue & 4) << 1); else ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? 
IB_AH_GRH : 0; if (ib_ah_attr->ah_flags) { ib_ah_attr->grh.sgid_index = path->mgid_index; ib_ah_attr->grh.hop_limit = path->hop_limit; ib_ah_attr->grh.traffic_class = (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; ib_ah_attr->grh.flow_label = be32_to_cpu(path->tclass_flowlabel) & 0xfffff; memcpy(ib_ah_attr->grh.dgid.raw, path->rgid, sizeof ib_ah_attr->grh.dgid.raw); } } int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); struct mlx4_qp_context context; int mlx4_state; int err = 0; mutex_lock(&qp->mutex); if (qp->state == IB_QPS_RESET) { qp_attr->qp_state = IB_QPS_RESET; goto done; } err = mlx4_qp_query(dev->dev, &qp->mqp, &context); if (err) { err = -EINVAL; goto out; } mlx4_state = be32_to_cpu(context.flags) >> 28; qp->state = to_ib_qp_state(mlx4_state); qp_attr->qp_state = qp->state; qp_attr->path_mtu = context.mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); qp_attr->qkey = be32_to_cpu(context.qkey); qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context.params2)); if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; if (qp_attr->qp_state == IB_QPS_INIT) qp_attr->port_num = qp->port; else qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 
2 : 1; /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); qp_attr->max_dest_rd_atomic = 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); qp_attr->min_rnr_timer = (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; qp_attr->timeout = context.pri_path.ackto >> 3; qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; qp_attr->alt_timeout = context.alt_path.ackto >> 3; done: qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; qp_attr->cap.max_recv_sge = qp->rq.max_gs; if (!ibqp->uobject) { qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; qp_attr->cap.max_send_sge = qp->sq.max_gs; } else { qp_attr->cap.max_send_wr = 0; qp_attr->cap.max_send_sge = 0; } /* * We don't support inline sends for kernel QPs (yet), and we * don't know what userspace's value should be. */ qp_attr->cap.max_inline_data = 0; qp_init_attr->cap = qp_attr->cap; qp_init_attr->create_flags = 0; if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; if (qp->flags & MLX4_IB_QP_LSO) qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; if (qp->flags & MLX4_IB_QP_NETIF) qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; qp_init_attr->sq_sig_type = qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; out: mutex_unlock(&qp->mutex); return err; } diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c index a2e55db37125..a040b673dae9 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c @@ -1,382 +1,382 @@ /* * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include "mlx4_ib.h" #include static void *get_wqe(struct mlx4_ib_srq *srq, int n) { return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); } static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) { struct ib_event event; struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; if (ibsrq->event_handler) { event.device = ibsrq->device; event.element.srq = ibsrq; switch (type) { case MLX4_EVENT_TYPE_SRQ_LIMIT: event.event = IB_EVENT_SRQ_LIMIT_REACHED; break; case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: event.event = IB_EVENT_SRQ_ERR; break; default: pr_warn("Unexpected event type %d " "on SRQ %06x\n", type, srq->srqn); return; } ibsrq->event_handler(&event, ibsrq->srq_context); } } struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_srq *srq; struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scatter; u32 cqn; u16 xrcdn; int desc_size; int buf_size; int err; int i; /* Sanity check SRQ size before proceeding */ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) return ERR_PTR(-EINVAL); srq = kmalloc(sizeof *srq, GFP_KERNEL); if (!srq) return ERR_PTR(-ENOMEM); mutex_init(&srq->mutex); spin_lock_init(&srq->lock); srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); srq->msrq.max_gs = init_attr->attr.max_sge; desc_size = max(32UL, roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) + srq->msrq.max_gs * sizeof (struct mlx4_wqe_data_seg))); srq->msrq.wqe_shift = ilog2(desc_size); buf_size = srq->msrq.max * desc_size; if (pd->uobject) { struct mlx4_ib_create_srq ucmd; if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { err = -EFAULT; goto err_srq; } srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, 0, 0); if (IS_ERR(srq->umem)) { err = PTR_ERR(srq->umem); goto err_srq; } err = mlx4_mtt_init(dev->dev, 
ib_umem_page_count(srq->umem), ilog2(srq->umem->page_size), &srq->mtt); if (err) goto err_buf; err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem); if (err) goto err_mtt; err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &srq->db); if (err) goto err_mtt; } else { err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); if (err) goto err_srq; *srq->db.db = 0; if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, GFP_KERNEL)) { err = -ENOMEM; goto err_db; } srq->head = 0; srq->tail = srq->msrq.max - 1; srq->wqe_ctr = 0; for (i = 0; i < srq->msrq.max; ++i) { next = get_wqe(srq, i); next->next_wqe_index = cpu_to_be16((i + 1) & (srq->msrq.max - 1)); for (scatter = (void *) (next + 1); (void *) scatter < (void *) next + desc_size; ++scatter) scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY); } err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift, &srq->mtt); if (err) goto err_buf; err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); if (err) goto err_mtt; srq->wrid = kmalloc_array(srq->msrq.max, sizeof(u64), GFP_KERNEL | __GFP_NOWARN); if (!srq->wrid) { srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL, 0 /*PAGE_KERNEL*/); if (!srq->wrid) { err = -ENOMEM; goto err_mtt; } } } cqn = (init_attr->srq_type == IB_SRQT_XRC) ? to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? 
to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : (u16) dev->dev->caps.reserved_xrcds; err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, srq->db.dma, &srq->msrq); if (err) goto err_wrid; srq->msrq.event = mlx4_ib_srq_event; srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; if (pd->uobject) if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { err = -EFAULT; goto err_wrid; } init_attr->attr.max_wr = srq->msrq.max - 1; return &srq->ibsrq; err_wrid: if (pd->uobject) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); else kvfree(srq->wrid); err_mtt: mlx4_mtt_cleanup(dev->dev, &srq->mtt); err_buf: if (pd->uobject) ib_umem_release(srq->umem); else mlx4_buf_free(dev->dev, buf_size, &srq->buf); err_db: if (!pd->uobject) mlx4_db_free(dev->dev, &srq->db); err_srq: kfree(srq); return ERR_PTR(err); } int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); struct mlx4_ib_srq *srq = to_msrq(ibsrq); int ret; /* We don't support resizing SRQs (yet?) 
*/ if (attr_mask & IB_SRQ_MAX_WR) return -EINVAL; if (attr_mask & IB_SRQ_LIMIT) { if (attr->srq_limit >= srq->msrq.max) return -EINVAL; mutex_lock(&srq->mutex); ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); mutex_unlock(&srq->mutex); if (ret) return ret; } return 0; } int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) { struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); struct mlx4_ib_srq *srq = to_msrq(ibsrq); int ret; int limit_watermark; ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark); if (ret) return ret; srq_attr->srq_limit = limit_watermark; srq_attr->max_wr = srq->msrq.max - 1; srq_attr->max_sge = srq->msrq.max_gs; return 0; } int mlx4_ib_destroy_srq(struct ib_srq *srq) { struct mlx4_ib_dev *dev = to_mdev(srq->device); struct mlx4_ib_srq *msrq = to_msrq(srq); mlx4_srq_free(dev->dev, &msrq->msrq); mlx4_mtt_cleanup(dev->dev, &msrq->mtt); if (srq->uobject) { mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); ib_umem_release(msrq->umem); } else { kvfree(msrq->wrid); mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, &msrq->buf); mlx4_db_free(dev->dev, &msrq->db); } kfree(msrq); return 0; } void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) { struct mlx4_wqe_srq_next_seg *next; /* always called with interrupts disabled. 
*/ spin_lock(&srq->lock); next = get_wqe(srq, srq->tail); next->next_wqe_index = cpu_to_be16(wqe_index); srq->tail = wqe_index; spin_unlock(&srq->lock); } -int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mlx4_ib_srq *srq = to_msrq(ibsrq); struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scat; unsigned long flags; int err = 0; int nreq; int i; struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device); spin_lock_irqsave(&srq->lock, flags); if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; nreq = 0; goto out; } for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(wr->num_sge > srq->msrq.max_gs)) { err = -EINVAL; *bad_wr = wr; break; } if (unlikely(srq->head == srq->tail)) { err = -ENOMEM; *bad_wr = wr; break; } srq->wrid[srq->head] = wr->wr_id; next = get_wqe(srq, srq->head); srq->head = be16_to_cpu(next->next_wqe_index); scat = (struct mlx4_wqe_data_seg *) (next + 1); for (i = 0; i < wr->num_sge; ++i) { scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); } if (i < srq->msrq.max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); scat[i].addr = 0; } } if (likely(nreq)) { srq->wqe_ctr += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); *srq->db.db = cpu_to_be32(srq->wqe_ctr); } out: spin_unlock_irqrestore(&srq->lock, flags); return err; } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib.h b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h index 44a9aa307be7..695b1ab14948 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib.h +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h @@ -1,1112 +1,1112 @@ /*- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef MLX5_IB_H #define MLX5_IB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #define mlx5_ib_dbg(dev, format, arg...) \ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) #define mlx5_ib_err(dev, format, arg...) \ pr_err("%s: ERR: %s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) #define mlx5_ib_warn(dev, format, arg...) 
\ pr_warn("%s: WARN: %s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) #define field_avail(type, fld, sz) (offsetof(type, fld) + \ sizeof(((type *)0)->fld) <= (sz)) #define MLX5_IB_DEFAULT_UIDX 0xffffff #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) #define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size) enum { MLX5_IB_MMAP_CMD_SHIFT = 8, MLX5_IB_MMAP_CMD_MASK = 0xff, }; enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_REGULAR_PAGE = 0, MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, MLX5_IB_MMAP_WC_PAGE = 2, MLX5_IB_MMAP_NC_PAGE = 3, /* 5 is chosen in order to be compatible with old versions of libmlx5 */ MLX5_IB_MMAP_CORE_CLOCK = 5, }; enum { MLX5_RES_SCAT_DATA32_CQE = 0x1, MLX5_RES_SCAT_DATA64_CQE = 0x2, MLX5_REQ_SCAT_DATA32_CQE = 0x11, MLX5_REQ_SCAT_DATA64_CQE = 0x22, }; enum mlx5_ib_latency_class { MLX5_IB_LATENCY_CLASS_LOW, MLX5_IB_LATENCY_CLASS_MEDIUM, MLX5_IB_LATENCY_CLASS_HIGH, MLX5_IB_LATENCY_CLASS_FAST_PATH }; enum mlx5_ib_mad_ifc_flags { MLX5_MAD_IFC_IGNORE_MKEY = 1, MLX5_MAD_IFC_IGNORE_BKEY = 2, MLX5_MAD_IFC_NET_VIEW = 4, }; enum { MLX5_CROSS_CHANNEL_BFREG = 0, }; enum { MLX5_CQE_VERSION_V0, MLX5_CQE_VERSION_V1, }; enum { MLX5_IB_INVALID_UAR_INDEX = BIT(31), MLX5_IB_INVALID_BFREG = BIT(31), }; struct mlx5_ib_vma_private_data { struct list_head list; struct vm_area_struct *vma; }; struct mlx5_bfreg_info { u32 *sys_pages; int num_low_latency_bfregs; unsigned int *count; /* * protect bfreg allocation data structs */ struct mutex lock; u32 ver; u8 lib_uar_4k : 1; u8 lib_uar_dyn : 1; u32 num_sys_pages; u32 num_static_sys_pages; u32 total_num_bfregs; u32 num_dyn_bfregs; }; struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; /* protect doorbell record alloc/free */ struct mutex db_page_mutex; struct mlx5_bfreg_info bfregi; u8 cqe_version; /* Transport Domain number */ u32 tdn; struct list_head vma_private_list; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct 
ib_ucontext *ibucontext) { return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); } struct mlx5_ib_pd { struct ib_pd ibpd; u32 pdn; }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) #define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) #if (MLX5_IB_FLOW_LAST_PRIO <= 0) #error "Invalid number of bypass priorities" #endif #define MLX5_IB_FLOW_LEFTOVERS_PRIO (MLX5_IB_FLOW_MCAST_PRIO + 1) #define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) #define MLX5_IB_NUM_SNIFFER_FTS 2 struct mlx5_ib_flow_prio { struct mlx5_flow_table *flow_table; unsigned int refcount; }; struct mlx5_ib_flow_handler { struct list_head list; struct ib_flow ibflow; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_rule *rule; }; struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; struct mlx5_flow_table *lag_demux_ft; /* Protect flow steering bypass flow tables * when add/del flow rules. * only single add/removal of flow steering rule could be done * simultaneously. */ struct mutex lock; }; /* Use macros here so that don't have to duplicate * enum ib_send_flags and enum ib_qp_type for low-level driver */ #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) #define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) #define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) #define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 /* * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI * creates the actual hardware QP. 
*/ #define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 #define MLX5_IB_QPT_DCI IB_QPT_RESERVED3 #define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. * * These flags are intended for internal use by the mlx5_ib driver, and they * rely on the range reserved for that use in the ib_qp_create_flags enum. */ #define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START #define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1) struct wr_list { u16 opcode; u16 next; }; struct mlx5_ib_wq { u64 *wrid; u32 *wr_data; struct wr_list *w_list; unsigned *wqe_head; u16 unsig_count; /* serialize post to the work queue */ spinlock_t lock; int wqe_cnt; int max_post; int max_gs; int offset; int wqe_shift; unsigned head; unsigned tail; u16 cur_post; u16 last_poll; void *qend; }; struct mlx5_ib_rwq { struct ib_wq ibwq; struct mlx5_core_qp core_qp; u32 rq_num_pas; u32 log_rq_stride; u32 log_rq_size; u32 rq_page_offset; u32 log_page_size; struct ib_umem *umem; size_t buf_size; unsigned int page_shift; int create_type; struct mlx5_db db; u32 user_index; u32 wqe_count; u32 wqe_shift; int wq_sig; }; enum { MLX5_QP_USER, MLX5_QP_KERNEL, MLX5_QP_EMPTY }; enum { MLX5_WQ_USER, MLX5_WQ_KERNEL }; struct mlx5_ib_rwq_ind_table { struct ib_rwq_ind_table ib_rwq_ind_tbl; u32 rqtn; }; /* * Connect-IB can trigger up to four concurrent pagefaults * per-QP. 
*/ enum mlx5_ib_pagefault_context { MLX5_IB_PAGEFAULT_RESPONDER_READ, MLX5_IB_PAGEFAULT_REQUESTOR_READ, MLX5_IB_PAGEFAULT_RESPONDER_WRITE, MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, MLX5_IB_PAGEFAULT_CONTEXTS }; static inline enum mlx5_ib_pagefault_context mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) { return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); } struct mlx5_ib_pfault { struct work_struct work; struct mlx5_pagefault mpfault; }; struct mlx5_ib_ubuffer { struct ib_umem *umem; int buf_size; u64 buf_addr; }; struct mlx5_ib_qp_base { struct mlx5_ib_qp *container_mibqp; struct mlx5_core_qp mqp; struct mlx5_ib_ubuffer ubuffer; }; struct mlx5_ib_qp_trans { struct mlx5_ib_qp_base base; u16 xrcdn; u8 alt_port; u8 atomic_rd_en; u8 resp_depth; }; struct mlx5_ib_rss_qp { u32 tirn; }; struct mlx5_ib_rq { struct mlx5_ib_qp_base base; struct mlx5_ib_wq *rq; struct mlx5_ib_ubuffer ubuffer; struct mlx5_db *doorbell; u32 tirn; u8 state; }; struct mlx5_ib_sq { struct mlx5_ib_qp_base base; struct mlx5_ib_wq *sq; struct mlx5_ib_ubuffer ubuffer; struct mlx5_db *doorbell; u32 tisn; u8 state; }; struct mlx5_ib_raw_packet_qp { struct mlx5_ib_sq sq; struct mlx5_ib_rq rq; }; struct mlx5_bf { int buf_size; unsigned long offset; struct mlx5_sq_bfreg *bfreg; spinlock_t lock32; }; struct mlx5_ib_qp { struct ib_qp ibqp; union { struct mlx5_ib_qp_trans trans_qp; struct mlx5_ib_raw_packet_qp raw_packet_qp; struct mlx5_ib_rss_qp rss_qp; }; struct mlx5_buf buf; struct mlx5_db db; struct mlx5_ib_wq rq; u8 sq_signal_bits; u8 fm_cache; struct mlx5_ib_wq sq; /* serialize qp state modifications */ struct mutex mutex; u32 flags; u8 port; u8 state; int wq_sig; int scat_cqe; int max_inline_data; struct mlx5_bf bf; int has_rq; /* only for user space QPs. 
For kernel * we have it from the bf object */ int bfregn; int create_type; /* Store signature errors */ bool signature_en; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* * A flag that is true for QP's that are in a state that doesn't * allow page faults, and shouldn't schedule any more faults. */ int disable_page_faults; /* * The disable_page_faults_lock protects a QP's disable_page_faults * field, allowing for a thread to atomically check whether the QP * allows page faults, and if so schedule a page fault. */ spinlock_t disable_page_faults_lock; struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; #endif struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; }; struct mlx5_ib_cq_buf { struct mlx5_buf buf; struct ib_umem *umem; int cqe_size; int nent; }; enum mlx5_ib_qp_flags { MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, /* QP uses 1 as its source QP number */ MLX5_IB_QP_SQPN_QP1 = 1 << 6, MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, MLX5_IB_QP_RSS = 1 << 8, }; struct mlx5_umr_wr { struct ib_send_wr wr; union { u64 virt_addr; u64 offset; } target; struct ib_pd *pd; unsigned int page_shift; unsigned int npages; u32 length; int access_flags; u32 mkey; }; -static inline struct mlx5_umr_wr *umr_wr(struct ib_send_wr *wr) +static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr) { return container_of(wr, struct mlx5_umr_wr, wr); } struct mlx5_shared_mr_info { int mr_id; struct ib_umem *umem; }; struct mlx5_ib_cq { struct ib_cq ibcq; struct mlx5_core_cq mcq; struct mlx5_ib_cq_buf buf; struct mlx5_db db; /* serialize access to the CQ */ spinlock_t lock; /* protect resize cq */ struct mutex resize_mutex; struct mlx5_ib_cq_buf *resize_buf; struct 
ib_umem *resize_umem; int cqe_size; struct list_head list_send_qp; struct list_head list_recv_qp; u32 create_flags; struct list_head wc_list; enum ib_cq_notify_flags notify_flags; struct work_struct notify_work; }; struct mlx5_ib_wc { struct ib_wc wc; struct list_head list; }; /* mlx5 SRQ object; to_msrq() recovers it from the embedded ib_srq */ struct mlx5_ib_srq { struct ib_srq ibsrq; struct mlx5_core_srq msrq; struct mlx5_buf buf; struct mlx5_db db; u64 *wrid; /* protect SRQ handling */ spinlock_t lock; int head; int tail; u16 wqe_ctr; struct ib_umem *umem; /* serialize arming a SRQ */ struct mutex mutex; int wq_sig; }; struct mlx5_ib_xrcd { struct ib_xrcd ibxrcd; u32 xrcdn; }; enum mlx5_ib_mtt_access_flags { MLX5_IB_MTT_READ = (1 << 0), MLX5_IB_MTT_WRITE = (1 << 1), }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) struct mlx5_ib_mr { struct ib_mr ibmr; void *descs; dma_addr_t desc_map; int ndescs; int max_descs; int desc_size; int access_mode; struct mlx5_core_mr mmkey; struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; struct list_head list; int order; int umred; int npages; struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; struct mlx5_core_sig_ctx *sig; int live; void *descs_alloc; int access_flags; /* Needed for rereg MR */ struct mlx5_async_work cb_work; }; struct mlx5_ib_mw { struct ib_mw ibmw; struct mlx5_core_mr mmkey; }; struct mlx5_ib_umr_context { struct ib_cqe cqe; enum ib_wc_status status; struct completion done; }; struct umr_common { struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; /* control access to UMR QP */ struct semaphore sem; }; enum { MLX5_FMR_INVALID, MLX5_FMR_VALID, MLX5_FMR_BUSY, }; /* One element of the ent[] array in struct mlx5_mr_cache */ struct mlx5_cache_ent { struct list_head head; /* sync access to the cache entry */ spinlock_t lock; struct dentry *dir; char name[4]; u32 order; u32 size; u32 cur; u32 miss; u32 limit; struct dentry *fsize; struct dentry *fcur; struct dentry *fmiss; struct dentry *flimit; struct mlx5_ib_dev *dev; struct work_struct work; struct delayed_work dwork; int pending; }; struct
mlx5_mr_cache { struct workqueue_struct *wq; struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; int stopped; struct dentry *root; unsigned long last_add; }; struct mlx5_ib_gsi_qp; struct mlx5_ib_port_resources { struct mlx5_ib_resources *devr; struct mlx5_ib_gsi_qp *gsi; struct work_struct pkey_change_work; }; struct mlx5_ib_resources { struct ib_cq *c0; struct ib_xrcd *x0; struct ib_xrcd *x1; struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; struct mlx5_ib_port_resources ports[2]; /* Protects changes to the port resources */ struct mutex mutex; }; struct mlx5_ib_port { u16 q_cnt_id; }; struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer */ rwlock_t netdev_lock; struct ifnet *netdev; struct notifier_block nb; atomic_t next_port; }; #define MLX5_IB_STATS_COUNT(a,b,c,d) a #define MLX5_IB_STATS_VAR(a,b,c,d) b; #define MLX5_IB_STATS_DESC(a,b,c,d) c, d, #define MLX5_IB_CONG_PARAMS(m) \ /* ECN RP */ \ m(+1, u64 rp_clamp_tgt_rate, "rp_clamp_tgt_rate", "If set, whenever a CNP is processed, the target rate is updated to be the current rate") \ m(+1, u64 rp_clamp_tgt_rate_ati, "rp_clamp_tgt_rate_ati", "If set, when receiving a CNP, the target rate should be updated if the transission rate was increased due to the timer, and not only due to the byte counter") \ m(+1, u64 rp_time_reset, "rp_time_reset", "Time in microseconds between rate increases if no CNPs are received") \ m(+1, u64 rp_byte_reset, "rp_byte_reset", "Transmitted data in bytes between rate increases if no CNP's are received. 
A value of zero means disabled.") \ m(+1, u64 rp_threshold, "rp_threshold", "The number of times rpByteStage or rpTimeStage can count before the RP rate control state machine advances states") \ m(+1, u64 rp_ai_rate, "rp_ai_rate", "The rate, in Mbits per second, used to increase rpTargetRate in the active increase state") \ m(+1, u64 rp_hai_rate, "rp_hai_rate", "The rate, in Mbits per second, used to increase rpTargetRate in the hyper increase state") \ m(+1, u64 rp_min_dec_fac, "rp_min_dec_fac", "The minimum factor by which the current transmit rate can be changed when processing a CNP. Value is given as a percentage, [1 .. 100]") \ m(+1, u64 rp_min_rate, "rp_min_rate", "The minimum value, in Mbps per second, for rate to limit") \ m(+1, u64 rp_rate_to_set_on_first_cnp, "rp_rate_to_set_on_first_cnp", "The rate that is set for the flow when a rate limiter is allocated to it upon first CNP received, in Mbps. A value of zero means use full port speed") \ m(+1, u64 rp_dce_tcp_g, "rp_dce_tcp_g", "Used to update the congestion estimator, alpha, once every dce_tcp_rtt once every dce_tcp_rtt microseconds") \ m(+1, u64 rp_dce_tcp_rtt, "rp_dce_tcp_rtt", "The time between updates of the aolpha value, in microseconds") \ m(+1, u64 rp_rate_reduce_monitor_period, "rp_rate_reduce_monitor_period", "The minimum time between two consecutive rate reductions for a single flow") \ m(+1, u64 rp_initial_alpha_value, "rp_initial_alpha_value", "The initial value of alpha to use when receiving the first CNP for a flow") \ m(+1, u64 rp_gd, "rp_gd", "If a CNP is received, the flow rate is reduced at the beginning of the next rate_reduce_monitor_period interval") \ /* ECN NP */ \ m(+1, u64 np_cnp_dscp, "np_cnp_dscp", "The DiffServ Code Point of the generated CNP for this port") \ m(+1, u64 np_cnp_prio_mode, "np_cnp_prio_mode", "The 802.1p priority value of the generated CNP for this port") \ m(+1, u64 np_cnp_prio, "np_cnp_prio", "The 802.1p priority value of the generated CNP for this port") 
#define MLX5_IB_CONG_PARAMS_NUM (0 MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_COUNT)) #define MLX5_IB_CONG_STATS(m) \ m(+1, u64 syndrome, "syndrome", "Syndrome number") \ m(+1, u64 rp_cur_flows, "rp_cur_flows", "Number of flows limited") \ m(+1, u64 sum_flows, "sum_flows", "Sum of the number of flows limited over time") \ m(+1, u64 rp_cnp_ignored, "rp_cnp_ignored", "Number of CNPs and CNMs ignored") \ m(+1, u64 rp_cnp_handled, "rp_cnp_handled", "Number of CNPs and CNMs successfully handled") \ m(+1, u64 time_stamp, "time_stamp", "Time stamp in microseconds") \ m(+1, u64 accumulators_period, "accumulators_period", "The value of X variable for accumulating counters") \ m(+1, u64 np_ecn_marked_roce_packets, "np_ecn_marked_roce_packets", "Number of ECN marked packets seen") \ m(+1, u64 np_cnp_sent, "np_cnp_sent", "Number of CNPs sent") #define MLX5_IB_CONG_STATS_NUM (0 MLX5_IB_CONG_STATS(MLX5_IB_STATS_COUNT)) struct mlx5_ib_congestion { struct sysctl_ctx_list ctx; struct sx lock; struct delayed_work dwork; union { u64 arg[1]; struct { MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_VAR) MLX5_IB_CONG_STATS(MLX5_IB_STATS_VAR) }; }; }; struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; struct mlx5_roce roce; MLX5_DECLARE_DOORBELL_LOCK(uar_lock); int num_ports; /* serialize update of capability mask */ struct mutex cap_mask_mutex; bool ib_active; struct umr_common umrc; /* sync used page count stats */ struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; struct timer_list delay_timer; /* Prevents soft lock on massive reg MRs */ struct mutex slow_path_mutex; int fill_delay; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_odp_caps odp_caps; /* * Sleepable RCU that prevents destruction of MRs while they are still * being used by a page fault handler. 
*/ struct srcu_struct mr_srcu; #endif struct mlx5_ib_flow_db flow_db; /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; struct mlx5_sq_bfreg wc_bfreg; struct mlx5_sq_bfreg fp_bfreg; struct mlx5_ib_congestion congestion; struct mlx5_async_ctx async_ctx; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) { return container_of(mcq, struct mlx5_ib_cq, mcq); } static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) { return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); } static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx5_ib_dev, ib_dev); } static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx5_ib_cq, ibcq); } static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) { return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } static inline struct mlx5_ib_rwq *to_mibrwq(struct mlx5_core_qp *core_qp) { return container_of(core_qp, struct mlx5_ib_rwq, core_qp); } static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmkey) { return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) { return container_of(ibpd, struct mlx5_ib_pd, ibpd); } static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) { return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); } static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx5_ib_qp, ibqp); } static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) { return container_of(ibwq, struct mlx5_ib_rwq, ibwq); } static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) { return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); } static inline 
struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) { return container_of(msrq, struct mlx5_ib_srq, msrq); } static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) { return container_of(ibmr, struct mlx5_ib_mr, ibmr); } static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) { return container_of(ibmw, struct mlx5_ib_mw, ibmw); } struct mlx5_ib_ah { struct ib_ah ibah; struct mlx5_av av; }; static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) { return container_of(ibah, struct mlx5_ib_ah, ibah); } int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad); struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct ib_udata *udata); int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int mlx5_ib_destroy_ah(struct ib_ah *ah); struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); int mlx5_ib_destroy_srq(struct ib_srq *srq); -int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, 
struct ib_udata *udata); int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int mlx5_ib_destroy_qp(struct ib_qp *qp); -int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, void *buffer, u32 length, struct mlx5_ib_qp_base *base); struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int mlx5_ib_destroy_cq(struct ib_cq *cq); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, int zap); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_pd *pd, struct ib_udata *udata); int mlx5_ib_dereg_mr(struct ib_mr 
*ibmr); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, struct ib_mad_hdr *out, size_t *out_mad_size, u16 *out_mad_pkey_index); struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata); int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, struct ib_smp *out_mad); int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid); int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys); int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, u32 *vendor_id); int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc); int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid); int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey); int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid); int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, unsigned long max_page_shift, int *count, int *shift, int *ncont, int *order); void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, size_t offset, size_t num_pages, __be64 *pas, int access_flags); void 
mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_destroy_wq(struct ib_wq *wq); int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata); struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING extern struct workqueue_struct *mlx5_ib_page_fault_wq; void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault); void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) { return; } static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } static inline void 
mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u8 port, struct ifla_vf_info *info); int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, int state); int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, u8 port, struct ifla_vf_stats *stats); int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, int type); __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index); int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type); /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr); int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask); int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); -int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn); static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; 
mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static inline u8 convert_access(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | MLX5_PERM_LOCAL_READ; } static inline int is_qp1(enum ib_qp_type qp_type) { return qp_type == MLX5_IB_QPT_HW_GSI; } #define MLX5_MAX_UMR_SHIFT 16 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) static inline u32 check_cq_create_flags(u32 flags) { /* * It returns non-zero value for unsupported CQ * create flags, otherwise it returns zero. */ return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); } static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, u32 *user_index) { if (cqe_version) { if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) return -EINVAL; *user_index = cmd_uidx; } else { *user_index = MLX5_IB_DEFAULT_UIDX; } return 0; } static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_qp *ucmd, int inlen, u32 *user_index) { u8 cqe_version = ucontext->cqe_version; if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) return 0; if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != !!cqe_version)) return -EINVAL; return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); } static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_srq *ucmd, int inlen, u32 *user_index) { u8 cqe_version = ucontext->cqe_version; if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) return 0; if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != !!cqe_version)) return -EINVAL; return 
verify_assign_uidx(cqe_version, ucmd->uidx, user_index); } void mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *); int mlx5_ib_init_congestion(struct mlx5_ib_dev *); static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_support) { return lib_support && MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; } static inline int get_num_static_uars(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; } int bfregn_to_uar_index(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, u32 bfregn, bool dyn_bfreg); #endif /* MLX5_IB_H */ diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c index 6c0417851665..dc90b1348378 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c @@ -1,536 +1,536 @@ /*- * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "mlx5_ib.h" struct mlx5_ib_gsi_wr { struct ib_cqe cqe; struct ib_wc wc; int send_flags; bool completed:1; }; struct mlx5_ib_gsi_qp { struct ib_qp ibqp; struct ib_qp *rx_qp; u8 port_num; struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; /* Serialize qp state modifications */ struct mutex mutex; struct ib_cq *cq; struct mlx5_ib_gsi_wr *outstanding_wrs; u32 outstanding_pi, outstanding_ci; int num_qps; /* Protects access to the tx_qps. Post send operations synchronize * with tx_qp creation in setup_qp(). Also protects the * outstanding_wrs array and indices. 
*/ spinlock_t lock; struct ib_qp **tx_qps; }; static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) { return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); } static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) { return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); } /* Call with gsi->lock locked */ static void generate_completions(struct mlx5_ib_gsi_qp *gsi) { struct ib_cq *gsi_cq = gsi->ibqp.send_cq; struct mlx5_ib_gsi_wr *wr; u32 index; for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; index++) { wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr]; if (!wr->completed) break; if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR || wr->send_flags & IB_SEND_SIGNALED) WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc)); wr->completed = false; } gsi->outstanding_ci = index; } static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) { struct mlx5_ib_gsi_qp *gsi = cq->cq_context; struct mlx5_ib_gsi_wr *wr = container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); u64 wr_id; unsigned long flags; spin_lock_irqsave(&gsi->lock, flags); wr->completed = true; wr_id = wr->wc.wr_id; wr->wc = *wc; wr->wc.wr_id = wr_id; wr->wc.qp = &gsi->ibqp; generate_completions(gsi); spin_unlock_irqrestore(&gsi->lock, flags); } struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_gsi_qp *gsi; struct ib_qp_init_attr hw_init_attr = *init_attr; const u8 port_num = init_attr->port_num; const int num_pkeys = pd->device->attrs.max_pkeys; const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? 
num_pkeys : 0; int ret; mlx5_ib_dbg(dev, "creating GSI QP\n"); if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { mlx5_ib_warn(dev, "invalid port number %d during GSI QP creation\n", port_num); return ERR_PTR(-EINVAL); } gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); if (!gsi) return ERR_PTR(-ENOMEM); gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); if (!gsi->tx_qps) { ret = -ENOMEM; goto err_free; } gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr, sizeof(*gsi->outstanding_wrs), GFP_KERNEL); if (!gsi->outstanding_wrs) { ret = -ENOMEM; goto err_free_tx; } mutex_init(&gsi->mutex); mutex_lock(&dev->devr.mutex); if (dev->devr.ports[port_num - 1].gsi) { mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", port_num); ret = -EBUSY; goto err_free_wrs; } gsi->num_qps = num_qps; spin_lock_init(&gsi->lock); gsi->cap = init_attr->cap; gsi->sq_sig_type = init_attr->sq_sig_type; gsi->ibqp.qp_num = 1; gsi->port_num = port_num; gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0, IB_POLL_SOFTIRQ); if (IS_ERR(gsi->cq)) { mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", PTR_ERR(gsi->cq)); ret = PTR_ERR(gsi->cq); goto err_free_wrs; } hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; hw_init_attr.send_cq = gsi->cq; if (num_qps) { hw_init_attr.cap.max_send_wr = 0; hw_init_attr.cap.max_send_sge = 0; hw_init_attr.cap.max_inline_data = 0; } gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); if (IS_ERR(gsi->rx_qp)) { mlx5_ib_warn(dev, "unable to create hardware GSI QP. 
error %ld\n", PTR_ERR(gsi->rx_qp)); ret = PTR_ERR(gsi->rx_qp); goto err_destroy_cq; } dev->devr.ports[init_attr->port_num - 1].gsi = gsi; mutex_unlock(&dev->devr.mutex); return &gsi->ibqp; err_destroy_cq: ib_free_cq(gsi->cq); err_free_wrs: mutex_unlock(&dev->devr.mutex); kfree(gsi->outstanding_wrs); err_free_tx: kfree(gsi->tx_qps); err_free: kfree(gsi); return ERR_PTR(ret); } int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); const int port_num = gsi->port_num; int qp_index; int ret; mlx5_ib_dbg(dev, "destroying GSI QP\n"); mutex_lock(&dev->devr.mutex); ret = ib_destroy_qp(gsi->rx_qp); if (ret) { mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n", ret); mutex_unlock(&dev->devr.mutex); return ret; } dev->devr.ports[port_num - 1].gsi = NULL; mutex_unlock(&dev->devr.mutex); gsi->rx_qp = NULL; for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { if (!gsi->tx_qps[qp_index]) continue; WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); gsi->tx_qps[qp_index] = NULL; } ib_free_cq(gsi->cq); kfree(gsi->outstanding_wrs); kfree(gsi->tx_qps); kfree(gsi); return 0; } static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) { struct ib_pd *pd = gsi->rx_qp->pd; struct ib_qp_init_attr init_attr = { .event_handler = gsi->rx_qp->event_handler, .qp_context = gsi->rx_qp->qp_context, .send_cq = gsi->cq, .recv_cq = gsi->rx_qp->recv_cq, .cap = { .max_send_wr = gsi->cap.max_send_wr, .max_send_sge = gsi->cap.max_send_sge, .max_inline_data = gsi->cap.max_inline_data, }, .sq_sig_type = gsi->sq_sig_type, .qp_type = IB_QPT_UD, .create_flags = MLX5_IB_QP_CREATE_SQPN_QP1, }; return ib_create_qp(pd, &init_attr); } static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, u16 qp_index) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct ib_qp_attr attr; int mask; int ret; mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; attr.qp_state = IB_QPS_INIT; 
attr.pkey_index = qp_index; attr.qkey = IB_QP1_QKEY; attr.port_num = gsi->port_num; ret = ib_modify_qp(qp, &attr, mask); if (ret) { mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", qp->qp_num, ret); return ret; } attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", qp->qp_num, ret); return ret; } attr.qp_state = IB_QPS_RTS; attr.sq_psn = 0; ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", qp->qp_num, ret); return ret; } return 0; } static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) { struct ib_device *device = gsi->rx_qp->device; struct mlx5_ib_dev *dev = to_mdev(device); struct ib_qp *qp; unsigned long flags; u16 pkey; int ret; ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); if (ret) { mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", gsi->port_num, qp_index); return; } if (!pkey) { mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. Skipping.\n", gsi->port_num, qp_index); return; } spin_lock_irqsave(&gsi->lock, flags); qp = gsi->tx_qps[qp_index]; spin_unlock_irqrestore(&gsi->lock, flags); if (qp) { mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. 
Skipping\n", gsi->port_num, qp_index); return; } qp = create_gsi_ud_qp(gsi); if (IS_ERR(qp)) { mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", PTR_ERR(qp)); return; } ret = modify_to_rts(gsi, qp, qp_index); if (ret) goto err_destroy_qp; spin_lock_irqsave(&gsi->lock, flags); WARN_ON_ONCE(gsi->tx_qps[qp_index]); gsi->tx_qps[qp_index] = qp; spin_unlock_irqrestore(&gsi->lock, flags); return; err_destroy_qp: WARN_ON_ONCE(qp); } static void setup_qps(struct mlx5_ib_gsi_qp *gsi) { u16 qp_index; for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) setup_qp(gsi, qp_index); } int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); int ret; mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); mutex_lock(&gsi->mutex); ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); if (ret) { mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); goto unlock; } if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS) setup_qps(gsi); unlock: mutex_unlock(&gsi->mutex); return ret; } int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); int ret; mutex_lock(&gsi->mutex); ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); qp_init_attr->cap = gsi->cap; mutex_unlock(&gsi->mutex); return ret; } /* Call with gsi->lock locked */ static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr, struct ib_wc *wc) { struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); struct mlx5_ib_gsi_wr *gsi_wr; if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { mlx5_ib_warn(dev, "no available GSI work request.\n"); return -ENOMEM; } gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi % gsi->cap.max_send_wr]; gsi->outstanding_pi++; if (!wc) { memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); 
gsi_wr->wc.pkey_index = wr->pkey_index; gsi_wr->wc.wr_id = wr->wr.wr_id; } else { gsi_wr->wc = *wc; gsi_wr->completed = true; } gsi_wr->cqe.done = &handle_single_completion; wr->wr.wr_cqe = &gsi_wr->cqe; return 0; } /* Call with gsi->lock locked */ static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) { struct ib_wc wc = { { .wr_id = wr->wr.wr_id }, .status = IB_WC_SUCCESS, .opcode = IB_WC_SEND, .qp = &gsi->ibqp, }; int ret; ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc); if (ret) return ret; generate_completions(gsi); return 0; } /* Call with gsi->lock locked */ static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) { struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); int qp_index = wr->pkey_index; if (!mlx5_ib_deth_sqpn_cap(dev)) return gsi->rx_qp; if (qp_index >= gsi->num_qps) return NULL; return gsi->tx_qps[qp_index]; } -int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); struct ib_qp *tx_qp; unsigned long flags; int ret; for (; wr; wr = wr->next) { struct ib_ud_wr cur_wr = *ud_wr(wr); cur_wr.wr.next = NULL; spin_lock_irqsave(&gsi->lock, flags); tx_qp = get_tx_qp(gsi, &cur_wr); if (!tx_qp) { ret = mlx5_ib_gsi_silent_drop(gsi, &cur_wr); if (ret) goto err; spin_unlock_irqrestore(&gsi->lock, flags); continue; } ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); if (ret) goto err; ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); if (ret) { /* Undo the effect of adding the outstanding wr */ gsi->outstanding_pi = (gsi->outstanding_pi - 1) % gsi->cap.max_send_wr; goto err; } spin_unlock_irqrestore(&gsi->lock, flags); } return 0; err: spin_unlock_irqrestore(&gsi->lock, flags); *bad_wr = wr; return ret; } -int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int 
mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); return ib_post_recv(gsi->rx_qp, wr, bad_wr); } void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) { if (!gsi) return; mutex_lock(&gsi->mutex); setup_qps(gsi); mutex_unlock(&gsi->mutex); } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c index 585e52d35d3b..145ec55d6757 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c @@ -1,1659 +1,1654 @@ /*- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include "mlx5_ib.h" enum { MAX_PENDING_REG_MR = 8, }; #define MLX5_UMR_ALIGN 2048 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING static __be64 mlx5_ib_update_mtt_emergency_buffer[ MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] __aligned(MLX5_UMR_ALIGN); static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); #endif static int clean_mr(struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* Wait until all page fault handlers using the mr complete. */ synchronize_srcu(&dev->mr_srcu); #endif return err; } static int order2idx(struct mlx5_ib_dev *dev, int order) { struct mlx5_mr_cache *cache = &dev->cache; if (order < cache->ent[0].order) return 0; else return order - cache->ent[0].order; } static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) { return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING static void update_odp_mr(struct mlx5_ib_mr *mr) { if (mr->umem->odp_data) { /* * This barrier prevents the compiler from moving the * setting of umem->odp_data->private to point to our * MR, before reg_umr finished, to ensure that the MR * initialization have finished before starting to * handle invalidations. */ smp_wmb(); mr->umem->odp_data->private = mr; /* * Make sure we will see the new * umem->odp_data->private value in the invalidation * routines, before we can get page faults on the * MR. Page faults can happen once we put the MR in * the tree, below this line. Without the barrier, * there can be a fault handling and an invalidation * before umem->odp_data->private == mr is visible to * the invalidation handler. 
*/ smp_wmb(); } } #endif static void reg_mr_callback(int status, struct mlx5_async_work *context) { struct mlx5_ib_mr *mr = container_of(context, struct mlx5_ib_mr, cb_work); struct mlx5_ib_dev *dev = mr->dev; struct mlx5_mr_cache *cache = &dev->cache; int c = order2idx(dev, mr->order); struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; int err; spin_lock_irqsave(&ent->lock, flags); ent->pending--; spin_unlock_irqrestore(&ent->lock, flags); if (status) { mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); kfree(mr); dev->fill_delay = 1; mod_timer(&dev->delay_timer, jiffies + HZ); return; } spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key; cache->last_add = jiffies; spin_lock_irqsave(&ent->lock, flags); list_add_tail(&mr->list, &ent->head); ent->cur++; ent->size++; spin_unlock_irqrestore(&ent->lock, flags); spin_lock_irqsave(&table->lock, flags); err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mr->mmkey.key), &mr->mmkey); if (err) pr_err("Error inserting to mkey tree. 
0x%x\n", -err); spin_unlock_irqrestore(&table->lock, flags); } static int add_keys(struct mlx5_ib_dev *dev, int c, int num) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mr *mr; int npages = 1 << ent->order; void *mkc; u32 *in; int err = 0; int i; in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); for (i = 0; i < num; i++) { if (ent->pending >= MAX_PENDING_REG_MR) { err = -EAGAIN; break; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { err = -ENOMEM; break; } mr->order = ent->order; mr->umred = 1; mr->dev = dev; MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2); MLX5_SET(mkc, mkc, log_page_size, 12); spin_lock_irq(&ent->lock); ent->pending++; spin_unlock_irq(&ent->lock); err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey, &dev->async_ctx, in, inlen, mr->out, sizeof(mr->out), reg_mr_callback, &mr->cb_work); if (err) { spin_lock_irq(&ent->lock); ent->pending--; spin_unlock_irq(&ent->lock); mlx5_ib_warn(dev, "create mkey failed %d\n", err); kfree(mr); break; } } kfree(in); return err; } static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; struct mlx5_ib_mr *mr; int err; int i; for (i = 0; i < num; i++) { spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; spin_unlock_irq(&ent->lock); err = destroy_mkey(dev, mr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); else kfree(mr); } } static int someone_adding(struct mlx5_mr_cache *cache) { int i; for (i = 0; i < MAX_MR_CACHE_ENTRIES; 
i++) { if (cache->ent[i].cur < cache->ent[i].limit) return 1; } return 0; } static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; struct mlx5_mr_cache *cache = &dev->cache; int i = order2idx(dev, ent->order); int err; if (cache->stopped) return; ent = &dev->cache.ent[i]; if (ent->cur < 2 * ent->limit && !dev->fill_delay) { err = add_keys(dev, i, 1); if (ent->cur < 2 * ent->limit) { if (err == -EAGAIN) { mlx5_ib_dbg(dev, "returned eagain, order %d\n", i + 2); queue_delayed_work(cache->wq, &ent->dwork, msecs_to_jiffies(3)); } else if (err) { mlx5_ib_warn(dev, "command failed order %d, err %d\n", i + 2, err); queue_delayed_work(cache->wq, &ent->dwork, msecs_to_jiffies(1000)); } else { queue_work(cache->wq, &ent->work); } } } else if (ent->cur > 2 * ent->limit) { /* * The remove_keys() logic is performed as garbage collection * task. Such task is intended to be run when no other active * processes are running. * * The need_resched() will return TRUE if there are user tasks * to be activated in near future. * * In such case, we don't execute remove_keys() and postpone * the garbage collection work to try to run in next cycle, * in order to free CPU resources to other tasks. 
*/ if (!need_resched() && !someone_adding(cache) && time_after(jiffies, cache->last_add + 300 * HZ)) { remove_keys(dev, i, 1); if (ent->cur > ent->limit) queue_work(cache->wq, &ent->work); } else { queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); } } } static void delayed_cache_work_func(struct work_struct *work) { struct mlx5_cache_ent *ent; ent = container_of(work, struct mlx5_cache_ent, dwork.work); __cache_work_func(ent); } static void cache_work_func(struct work_struct *work) { struct mlx5_cache_ent *ent; ent = container_of(work, struct mlx5_cache_ent, work); __cache_work_func(ent); } static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_ib_mr *mr = NULL; struct mlx5_cache_ent *ent; int c; int i; c = order2idx(dev, order); if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); return NULL; } for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { ent = &cache->ent[i]; mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); spin_lock_irq(&ent->lock); if (!list_empty(&ent->head)) { mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; spin_unlock_irq(&ent->lock); if (ent->cur < ent->limit) queue_work(cache->wq, &ent->work); break; } spin_unlock_irq(&ent->lock); queue_work(cache->wq, &ent->work); } if (!mr) cache->ent[c].miss++; return mr; } static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int shrink = 0; int c; c = order2idx(dev, mr->order); if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); return; } ent = &cache->ent[c]; spin_lock_irq(&ent->lock); list_add_tail(&mr->list, &ent->head); ent->cur++; if (ent->cur > 2 * ent->limit) shrink = 1; spin_unlock_irq(&ent->lock); if (shrink) queue_work(cache->wq, &ent->work); } static void clean_keys(struct 
mlx5_ib_dev *dev, int c) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; struct mlx5_ib_mr *mr; int err; cancel_delayed_work(&ent->dwork); while (1) { spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; spin_unlock_irq(&ent->lock); err = destroy_mkey(dev, mr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); else kfree(mr); } } static void delay_time_func(unsigned long ctx) { struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; dev->fill_delay = 0; } int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int limit; int i; mutex_init(&dev->slow_path_mutex); cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); return -ENOMEM; } mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { INIT_LIST_HEAD(&cache->ent[i].head); spin_lock_init(&cache->ent[i].lock); ent = &cache->ent[i]; INIT_LIST_HEAD(&ent->head); spin_lock_init(&ent->lock); ent->order = i + 2; ent->dev = dev; if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) limit = dev->mdev->profile->mr_cache[i].limit; else limit = 0; INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); ent->limit = limit; queue_work(cache->wq, &ent->work); } return 0; } int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) { int i; dev->cache.stopped = 1; flush_workqueue(dev->cache.wq); mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) clean_keys(dev, i); destroy_workqueue(dev->cache.wq); del_timer_sync(&dev->delay_timer); return 0; } struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct 
mlx5_ib_dev *dev = to_mdev(pd->device); int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_mr *mr; void *mkc; u32 *in; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA); MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); MLX5_SET(mkc, mkc, lr, 1); MLX5_SET(mkc, mkc, length64, 1); MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET64(mkc, mkc, start_addr, 0); err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); if (err) goto err_in; kfree(in); mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; return &mr->ibmr; err_in: kfree(in); err_free: kfree(mr); return ERR_PTR(err); } static int get_octo_len(u64 addr, u64 len, int page_size) { u64 offset; int npages; offset = addr & (page_size - 1); npages = ALIGN(len + offset, page_size) >> ilog2(page_size); return (npages + 1) / 2; } static int use_umr(int order) { return order <= MLX5_MAX_UMR_SHIFT; } static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int npages, int page_shift, int *size, __be64 **mr_pas, dma_addr_t *dma) { __be64 *pas; struct device *ddev = dev->ib_dev.dma_device; /* * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. * To avoid copying garbage after the pas array, we allocate * a little more. 
*/ *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); if (!(*mr_pas)) return -ENOMEM; pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN); mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); /* Clear padding after the actual pages. */ memset(pas + npages, 0, *size - npages * sizeof(u64)); *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE); if (dma_mapping_error(ddev, *dma)) { kfree(*mr_pas); return -ENOMEM; } return 0; } -static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr, +static void prep_umr_wqe_common(struct ib_pd *pd, struct mlx5_umr_wr *umrwr, struct ib_sge *sg, u64 dma, int n, u32 key, int page_shift) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_umr_wr *umrwr = umr_wr(wr); sg->addr = dma; sg->length = ALIGN(sizeof(u64) * n, 64); sg->lkey = dev->umrc.pd->local_dma_lkey; - wr->next = NULL; - wr->sg_list = sg; + umrwr->wr.next = NULL; + umrwr->wr.sg_list = sg; if (n) - wr->num_sge = 1; + umrwr->wr.num_sge = 1; else - wr->num_sge = 0; + umrwr->wr.num_sge = 0; - wr->opcode = MLX5_IB_WR_UMR; + umrwr->wr.opcode = MLX5_IB_WR_UMR; umrwr->npages = n; umrwr->page_shift = page_shift; umrwr->mkey = key; } -static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, +static void prep_umr_reg_wqe(struct ib_pd *pd, struct mlx5_umr_wr *umrwr, struct ib_sge *sg, u64 dma, int n, u32 key, int page_shift, u64 virt_addr, u64 len, int access_flags) { - struct mlx5_umr_wr *umrwr = umr_wr(wr); + prep_umr_wqe_common(pd, umrwr, sg, dma, n, key, page_shift); - prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift); - - wr->send_flags = 0; + umrwr->wr.send_flags = 0; umrwr->target.virt_addr = virt_addr; umrwr->length = len; umrwr->access_flags = access_flags; umrwr->pd = pd; } static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, - struct ib_send_wr *wr, u32 key) + struct mlx5_umr_wr *umrwr, u32 key) { - struct mlx5_umr_wr *umrwr = umr_wr(wr); - - 
wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; - wr->opcode = MLX5_IB_WR_UMR; + umrwr->wr.send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; + umrwr->wr.opcode = MLX5_IB_WR_UMR; umrwr->mkey = key; } static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length, int access_flags, int *npages, int *page_shift, int *ncont, int *order) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length, access_flags, 0); if (IS_ERR(umem)) { mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); return (void *)umem; } mlx5_ib_cont_pages(umem, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, page_shift, ncont, order); if (!*npages) { mlx5_ib_warn(dev, "avoid zero region\n"); ib_umem_release(umem); return ERR_PTR(-EINVAL); } mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", *npages, *ncont, *order, *page_shift); return umem; } static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) { struct mlx5_ib_umr_context *context = container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); context->status = wc->status; complete(&context->done); } static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) { context->cqe.done = mlx5_ib_umr_done; context->status = -1; init_completion(&context->done); } static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr, u64 len, int npages, int page_shift, int order, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct device *ddev = dev->ib_dev.dma_device; struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; struct mlx5_umr_wr umrwr = {}; - struct ib_send_wr *bad; + const struct ib_send_wr *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; int size; __be64 *mr_pas; dma_addr_t dma; int err = 0; int i; for (i = 0; i < 1; i++) { mr = alloc_cached_mr(dev, order); if (mr) break; err = add_keys(dev, order2idx(dev, order), 1); 
if (err && err != -EAGAIN) { mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); break; } } if (!mr) return ERR_PTR(-EAGAIN); err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas, &dma); if (err) goto free_mr; mlx5_ib_init_umr_context(&umr_context); umrwr.wr.wr_cqe = &umr_context.cqe; - prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, + prep_umr_reg_wqe(pd, &umrwr, &sg, dma, npages, mr->mmkey.key, page_shift, virt_addr, len, access_flags); down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { mlx5_ib_warn(dev, "post send failed, err %d\n", err); goto unmap_dma; } else { wait_for_completion(&umr_context.done); if (umr_context.status != IB_WC_SUCCESS) { mlx5_ib_warn(dev, "reg umr failed\n"); err = -EFAULT; } } mr->mmkey.iova = virt_addr; mr->mmkey.size = len; mr->mmkey.pd = to_mpd(pd)->pdn; mr->live = 1; unmap_dma: up(&umrc->sem); dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); kfree(mr_pas); free_mr: if (err) { free_cached_mr(dev, mr); return ERR_PTR(err); } return mr; } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, int zap) { struct mlx5_ib_dev *dev = mr->dev; struct device *ddev = dev->ib_dev.dma_device; struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; struct ib_umem *umem = mr->umem; int size; __be64 *pas; dma_addr_t dma; - struct ib_send_wr *bad; + const struct ib_send_wr *bad; struct mlx5_umr_wr wr; struct ib_sge sg; int err = 0; const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); const int page_index_mask = page_index_alignment - 1; size_t pages_mapped = 0; size_t pages_to_map = 0; size_t pages_iter = 0; int use_emergency_buf = 0; /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, * so we need to align the offset and length accordingly */ if (start_page_index & page_index_mask) { npages += start_page_index & page_index_mask; start_page_index &= ~page_index_mask; } 
pages_to_map = ALIGN(npages, page_index_alignment); if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) return -EINVAL; size = sizeof(u64) * pages_to_map; size = min_t(int, PAGE_SIZE, size); /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim * code, when we are called from an invalidation. The pas buffer must * be 2k-aligned for Connect-IB. */ pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); if (!pas) { mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); pas = mlx5_ib_update_mtt_emergency_buffer; size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; use_emergency_buf = 1; mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); memset(pas, 0, size); } pages_iter = size / sizeof(u64); dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); if (dma_mapping_error(ddev, dma)) { mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); err = -ENOMEM; goto free_pas; } for (pages_mapped = 0; pages_mapped < pages_to_map && !err; pages_mapped += pages_iter, start_page_index += pages_iter) { dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); npages = min_t(size_t, pages_iter, ib_umem_num_pages(umem) - start_page_index); if (!zap) { __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, start_page_index, npages, pas, MLX5_IB_MTT_PRESENT); /* Clear padding after the pages brought from the * umem. 
*/ memset(pas + npages, 0, size - npages * sizeof(u64)); } dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); mlx5_ib_init_umr_context(&umr_context); memset(&wr, 0, sizeof(wr)); wr.wr.wr_cqe = &umr_context.cqe; sg.addr = dma; sg.length = ALIGN(npages * sizeof(u64), MLX5_UMR_MTT_ALIGNMENT); sg.lkey = dev->umrc.pd->local_dma_lkey; wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | MLX5_IB_SEND_UMR_UPDATE_MTT; wr.wr.sg_list = &sg; wr.wr.num_sge = 1; wr.wr.opcode = MLX5_IB_WR_UMR; wr.npages = sg.length / sizeof(u64); wr.page_shift = PAGE_SHIFT; wr.mkey = mr->mmkey.key; wr.target.offset = start_page_index; down(&umrc->sem); err = ib_post_send(umrc->qp, &wr.wr, &bad); if (err) { mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); } else { wait_for_completion(&umr_context.done); if (umr_context.status != IB_WC_SUCCESS) { mlx5_ib_err(dev, "UMR completion failed, code %d\n", umr_context.status); err = -EFAULT; } } up(&umrc->sem); } dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); free_pas: if (!use_emergency_buf) free_page((unsigned long)pas); else mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); return err; } #endif /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. */ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, u64 virt_addr, u64 length, struct ib_umem *umem, int npages, int page_shift, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; __be64 *pas; void *mkc; int inlen; u32 *in; int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + sizeof(*pas) * ((npages + 1) / 2) * 2; in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto err_1; } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); mlx5_ib_populate_pas(dev, umem, page_shift, pas, pg_cap ? 
MLX5_IB_MTT_PRESENT : 0); /* The pg_access bit allows setting the access flags * in the page list submitted with the command. */ MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT); MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); MLX5_SET(mkc, mkc, lr, 1); MLX5_SET64(mkc, mkc, start_addr, virt_addr); MLX5_SET64(mkc, mkc, len, length); MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); MLX5_SET(mkc, mkc, bsf_octword_size, 0); MLX5_SET(mkc, mkc, translations_octword_size, get_octo_len(virt_addr, length, 1 << page_shift)); MLX5_SET(mkc, mkc, log_page_size, page_shift); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(create_mkey_in, in, translations_octword_actual_size, get_octo_len(virt_addr, length, 1 << page_shift)); err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; } mr->umem = umem; mr->dev = dev; mr->live = 1; kvfree(in); mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); return mr; err_2: kvfree(in); err_1: if (!ibmr) kfree(mr); return ERR_PTR(err); } static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, int npages, u64 length, int access_flags) { mr->npages = npages; atomic_add(npages, &dev->mdev->priv.reg_pages); mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; mr->ibmr.length = length; mr->access_flags = access_flags; } struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; struct ib_umem *umem; int page_shift; int npages; int ncont; int order; int err; mlx5_ib_dbg(dev, "start 
0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", (long long)start, (long long)virt_addr, (long long)length, access_flags); umem = mr_umem_get(pd, start, length, access_flags, &npages, &page_shift, &ncont, &order); if (IS_ERR(umem)) return (void *)umem; if (use_umr(order)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, order, access_flags); if (PTR_ERR(mr) == -EAGAIN) { mlx5_ib_dbg(dev, "cache empty for order %d", order); mr = NULL; } } else if (access_flags & IB_ACCESS_ON_DEMAND) { err = -EINVAL; pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); goto error; } if (!mr) { mutex_lock(&dev->slow_path_mutex); mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, page_shift, access_flags); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { err = PTR_ERR(mr); goto error; } mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); mr->umem = umem; set_mr_fileds(dev, mr, npages, length, access_flags); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING update_odp_mr(mr); #endif return &mr->ibmr; error: ib_umem_release(umem); return ERR_PTR(err); } static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_core_dev *mdev = dev->mdev; struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; struct mlx5_umr_wr umrwr = {}; - struct ib_send_wr *bad; + const struct ib_send_wr *bad; int err; if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) return 0; mlx5_ib_init_umr_context(&umr_context); umrwr.wr.wr_cqe = &umr_context.cqe; - prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); + prep_umr_unreg_wqe(dev, &umrwr, mr->mmkey.key); down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { up(&umrc->sem); mlx5_ib_dbg(dev, "err %d\n", err); goto error; } else { wait_for_completion(&umr_context.done); up(&umrc->sem); } if (umr_context.status != IB_WC_SUCCESS) { mlx5_ib_warn(dev, "unreg umr failed\n"); err = -EFAULT; goto error; } return 0; error: return err; } static int 
rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, u64 length, int npages, int page_shift, int order, int access_flags, int flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct device *ddev = dev->ib_dev.dma_device; struct mlx5_ib_umr_context umr_context; - struct ib_send_wr *bad; + const struct ib_send_wr *bad; struct mlx5_umr_wr umrwr = {}; struct ib_sge sg; struct umr_common *umrc = &dev->umrc; dma_addr_t dma = 0; __be64 *mr_pas = NULL; int size; int err; mlx5_ib_init_umr_context(&umr_context); umrwr.wr.wr_cqe = &umr_context.cqe; umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; if (flags & IB_MR_REREG_TRANS) { err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size, &mr_pas, &dma); if (err) return err; umrwr.target.virt_addr = virt_addr; umrwr.length = length; umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; } - prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, + prep_umr_wqe_common(pd, &umrwr, &sg, dma, npages, mr->mmkey.key, page_shift); if (flags & IB_MR_REREG_PD) { umrwr.pd = pd; umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD; } if (flags & IB_MR_REREG_ACCESS) { umrwr.access_flags = access_flags; umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; } /* post send request to UMR QP */ down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { mlx5_ib_warn(dev, "post send failed, err %d\n", err); } else { wait_for_completion(&umr_context.done); if (umr_context.status != IB_WC_SUCCESS) { mlx5_ib_warn(dev, "reg umr failed (%u)\n", umr_context.status); err = -EFAULT; } } up(&umrc->sem); if (flags & IB_MR_REREG_TRANS) { dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); kfree(mr_pas); } return err; } int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int new_access_flags, struct ib_pd *new_pd, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); struct mlx5_ib_mr *mr = to_mmr(ib_mr); struct ib_pd *pd = (flags & 
IB_MR_REREG_PD) ? new_pd : ib_mr->pd; int access_flags = flags & IB_MR_REREG_ACCESS ? new_access_flags : mr->access_flags; u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address; u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length; int page_shift = 0; int npages = 0; int ncont = 0; int order = 0; int err; mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", (long long)start, (long long)virt_addr, (long long)length, access_flags); if (flags != IB_MR_REREG_PD) { /* * Replace umem. This needs to be done whether or not UMR is * used. */ flags |= IB_MR_REREG_TRANS; ib_umem_release(mr->umem); mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages, &page_shift, &ncont, &order); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); mr->umem = NULL; return err; } } if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { /* * UMR can't be used - MKey needs to be replaced. */ if (mr->umred) { err = unreg_umr(dev, mr); if (err) mlx5_ib_warn(dev, "Failed to unregister MR\n"); } else { err = destroy_mkey(dev, mr); if (err) mlx5_ib_warn(dev, "Failed to destroy MKey\n"); } if (err) return err; mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, page_shift, access_flags); if (IS_ERR(mr)) return PTR_ERR(mr); mr->umred = 0; } else { /* * Send a UMR WQE */ err = rereg_umr(pd, mr, addr, len, npages, page_shift, order, access_flags, flags); if (err) { mlx5_ib_warn(dev, "Failed to rereg UMR\n"); return err; } } if (flags & IB_MR_REREG_PD) { ib_mr->pd = pd; mr->mmkey.pd = to_mpd(pd)->pdn; } if (flags & IB_MR_REREG_ACCESS) mr->access_flags = access_flags; if (flags & IB_MR_REREG_TRANS) { atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); set_mr_fileds(dev, mr, npages, len, access_flags); mr->mmkey.iova = addr; mr->mmkey.size = len; } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING update_odp_mr(mr); #endif return 0; } static int mlx5_alloc_priv_descs(struct ib_device *device, struct mlx5_ib_mr *mr, int ndescs, int 
desc_size) { int size = ndescs * desc_size; int add_size; int ret; add_size = max_t(int, MLX5_UMR_ALIGN - 1, 0); mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); if (!mr->descs_alloc) return -ENOMEM; mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); mr->desc_map = dma_map_single(device->dma_device, mr->descs, size, DMA_TO_DEVICE); if (dma_mapping_error(device->dma_device, mr->desc_map)) { ret = -ENOMEM; goto err; } return 0; err: kfree(mr->descs_alloc); return ret; } static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { if (mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; dma_unmap_single(device->dma_device, mr->desc_map, size, DMA_TO_DEVICE); kfree(mr->descs_alloc); mr->descs = NULL; } } static int clean_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int umred = mr->umred; int err; if (mr->sig) { if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", mr->sig->psv_memory.psv_idx); if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", mr->sig->psv_wire.psv_idx); kfree(mr->sig); mr->sig = NULL; } mlx5_free_priv_descs(mr); if (!umred) { err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", mr->mmkey.key, err); return err; } } else { err = unreg_umr(dev, mr); if (err) { mlx5_ib_warn(dev, "failed unregister\n"); return err; } free_cached_mr(dev, mr); } if (!umred) kfree(mr); return 0; } int mlx5_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx5_ib_dev *dev = to_mdev(ibmr->device); struct mlx5_ib_mr *mr = to_mmr(ibmr); int npages = mr->npages; struct ib_umem *umem = mr->umem; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (umem && umem->odp_data) { /* Prevent new page faults from succeeding */ mr->live = 0; /* Wait for all running page-fault handlers to finish. 
*/ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ mlx5_ib_invalidate_range(umem, ib_umem_start(umem), ib_umem_end(umem)); /* * We kill the umem before the MR for ODP, * so that there will not be any invalidations in * flight, looking at the *mr struct. */ ib_umem_release(umem); atomic_sub(npages, &dev->mdev->priv.reg_pages); /* Avoid double-freeing the umem. */ umem = NULL; } #endif clean_mr(mr); if (umem) { ib_umem_release(umem); atomic_sub(npages, &dev->mdev->priv.reg_pages); } return 0; } struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) { struct mlx5_ib_dev *dev = to_mdev(pd->device); int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); int ndescs = ALIGN(max_num_sg, 4); struct mlx5_ib_mr *mr; void *mkc; u32 *in; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, translations_octword_size, ndescs); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); if (mr_type == IB_MR_TYPE_MEM_REG) { mr->access_mode = MLX5_ACCESS_MODE_MTT; MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, sizeof(u64)); if (err) goto err_free_in; mr->desc_size = sizeof(u64); mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SG_GAPS) { mr->access_mode = MLX5_ACCESS_MODE_KLM; err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, sizeof(struct mlx5_klm)); if (err) goto err_free_in; mr->desc_size = sizeof(struct mlx5_klm); mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SIGNATURE) { u32 psv_index[2]; MLX5_SET(mkc, mkc, bsf_en, 1); MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); if (!mr->sig) { err = -ENOMEM; goto err_free_in; } /* create mem & wire PSVs */ err = 
mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); if (err) goto err_free_sig; mr->access_mode = MLX5_ACCESS_MODE_KLM; mr->sig->psv_memory.psv_idx = psv_index[0]; mr->sig->psv_wire.psv_idx = psv_index[1]; mr->sig->sig_status_checked = true; mr->sig->sig_err_exists = false; /* Next UMR, Arm SIGERR */ ++mr->sig->sigerr_count; } else { mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); err = -EINVAL; goto err_free_in; } MLX5_SET(mkc, mkc, access_mode, mr->access_mode); MLX5_SET(mkc, mkc, umr_en, 1); err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) goto err_destroy_psv; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; kfree(in); return &mr->ibmr; err_destroy_psv: if (mr->sig) { if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", mr->sig->psv_memory.psv_idx); if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", mr->sig->psv_wire.psv_idx); } mlx5_free_priv_descs(mr); err_free_sig: kfree(mr->sig); err_free_in: kfree(in); err_free: kfree(mr); return ERR_PTR(err); } struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mw *mw = NULL; u32 *in = NULL; void *mkc; int ndescs; int err; struct mlx5_ib_alloc_mw req = {}; struct { __u32 comp_mask; __u32 response_length; } resp = {}; err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); if (err) return ERR_PTR(err); if (req.comp_mask || req.reserved1 || req.reserved2) return ERR_PTR(-EOPNOTSUPP); if (udata->inlen > sizeof(req) && !ib_is_udata_cleared(udata, sizeof(req), udata->inlen - sizeof(req))) return ERR_PTR(-EOPNOTSUPP); ndescs = req.num_klms ? 
roundup(req.num_klms, 4) : roundup(1, 4); mw = kzalloc(sizeof(*mw), GFP_KERNEL); in = kzalloc(inlen, GFP_KERNEL); if (!mw || !in) { err = -ENOMEM; goto free; } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, translations_octword_size, ndescs); MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, lr, 1); MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_KLM); MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2))); MLX5_SET(mkc, mkc, qpn, 0xffffff); err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen); if (err) goto free; mw->ibmw.rkey = mw->mmkey.key; resp.response_length = min(offsetof(typeof(resp), response_length) + sizeof(resp.response_length), udata->outlen); if (resp.response_length) { err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) { mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); goto free; } } kfree(in); return &mw->ibmw; free: kfree(mw); kfree(in); return ERR_PTR(err); } int mlx5_ib_dealloc_mw(struct ib_mw *mw) { struct mlx5_ib_mw *mmw = to_mmw(mw); int err; err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev, &mmw->mmkey); if (!err) kfree(mmw); return err; } int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status) { struct mlx5_ib_mr *mmr = to_mmr(ibmr); int ret = 0; if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { pr_err("Invalid status check mask\n"); ret = -EINVAL; goto done; } mr_status->fail_status = 0; if (check_mask & IB_MR_CHECK_SIG_STATUS) { if (!mmr->sig) { ret = -EINVAL; pr_err("signature status check requested on a non-signature enabled MR\n"); goto done; } mmr->sig->sig_status_checked = true; if (!mmr->sig->sig_err_exists) goto done; if (ibmr->lkey == mmr->sig->err_item.key) memcpy(&mr_status->sig_err, &mmr->sig->err_item, sizeof(mr_status->sig_err)); else { mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; mr_status->sig_err.sig_err_offset = 0; mr_status->sig_err.key = 
mmr->sig->err_item.key; } mmr->sig->sig_err_exists = false; mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; } done: return ret; } static int mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, struct scatterlist *sgl, unsigned short sg_nents, unsigned int *sg_offset_p) { struct scatterlist *sg = sgl; struct mlx5_klm *klms = mr->descs; unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; u32 lkey = mr->ibmr.pd->local_dma_lkey; int i; mr->ibmr.iova = sg_dma_address(sg) + sg_offset; mr->ibmr.length = 0; mr->ndescs = sg_nents; for_each_sg(sgl, sg, sg_nents, i) { if (unlikely(i > mr->max_descs)) break; klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); klms[i].key = cpu_to_be32(lkey); mr->ibmr.length += sg_dma_len(sg); sg_offset = 0; } if (sg_offset_p) *sg_offset_p = sg_offset; return i; } static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); __be64 *descs; if (unlikely(mr->ndescs == mr->max_descs)) return -ENOMEM; descs = mr->descs; descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); return 0; } int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { struct mlx5_ib_mr *mr = to_mmr(ibmr); int n; mr->ndescs = 0; ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, DMA_TO_DEVICE); if (mr->access_mode == MLX5_ACCESS_MODE_KLM) n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); else n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx5_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, DMA_TO_DEVICE); return n; } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c index ec47b3e07b87..90c6d69e30c2 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c @@ -1,4997 +1,5003 @@ /*- * Copyright (c) 2013-2021, Mellanox Technologies. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include "mlx5_ib.h" /* not supported currently */ static int wq_signature; enum { MLX5_IB_ACK_REQ_FREQ = 8, }; enum { MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, MLX5_IB_LINK_TYPE_IB = 0, MLX5_IB_LINK_TYPE_ETH = 1 }; enum { MLX5_IB_SQ_STRIDE = 6, }; static const u32 mlx5_ib_opcode[] = { [IB_WR_SEND] = MLX5_OPCODE_SEND, [IB_WR_LSO] = MLX5_OPCODE_LSO, [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, [IB_WR_REG_MR] = MLX5_OPCODE_UMR, [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, }; struct mlx5_wqe_eth_pad { u8 rsvd0[16]; }; enum raw_qp_set_mask_map { MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID = 1UL << 0, }; struct mlx5_modify_raw_qp_param { u16 operation; u32 set_mask; /* raw_qp_set_mask_map */ u8 rq_q_ctr_id; }; static void get_cqs(enum ib_qp_type qp_type, struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); static int is_qp0(enum ib_qp_type qp_type) { return qp_type == IB_QPT_SMI; } static int is_sqp(enum ib_qp_type qp_type) { return is_qp0(qp_type) || is_qp1(qp_type); } static void *get_wqe(struct mlx5_ib_qp *qp, int offset) { return mlx5_buf_offset(&qp->buf, offset); } static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) { return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); } void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) { return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); } /** * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. 
* * @qp: QP to copy from. * @send: copy from the send queue when non-zero, use the receive queue * otherwise. * @wqe_index: index to start copying from. For send work queues, the * wqe_index is in units of MLX5_SEND_WQE_BB. * For receive work queue, it is the number of work queue * element in the queue. * @buffer: destination buffer. * @length: maximum number of bytes to copy. * * Copies at least a single WQE, but may copy more data. * * Return: the number of bytes copied, or an error code. */ int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, void *buffer, u32 length, struct mlx5_ib_qp_base *base) { struct ib_device *ibdev = qp->ibqp.device; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; size_t offset; size_t wq_end; struct ib_umem *umem = base->ubuffer.umem; u32 first_copy_length; int wqe_length; int ret; if (wq->wqe_cnt == 0) { mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n", qp->ibqp.qp_type); return -EINVAL; } offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) return -EINVAL; if (offset > umem->length || (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) return -EINVAL; first_copy_length = min_t(u32, offset + length, wq_end) - offset; ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); if (ret) return ret; if (send) { struct mlx5_wqe_ctrl_seg *ctrl = buffer; int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; wqe_length = ds * MLX5_WQE_DS_UNITS; } else { wqe_length = 1 << wq->wqe_shift; } if (wqe_length <= first_copy_length) return first_copy_length; ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, wqe_length - first_copy_length); if (ret) return ret; return wqe_length; } static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) { struct ib_qp *ibqp = 
&to_mibqp(qp)->ibqp; struct ib_event event; if (type == MLX5_EVENT_TYPE_PATH_MIG) { /* This event is only valid for trans_qps */ to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; } if (ibqp->event_handler) { event.device = ibqp->device; event.element.qp = ibqp; switch (type) { case MLX5_EVENT_TYPE_PATH_MIG: event.event = IB_EVENT_PATH_MIG; break; case MLX5_EVENT_TYPE_COMM_EST: event.event = IB_EVENT_COMM_EST; break; case MLX5_EVENT_TYPE_SQ_DRAINED: event.event = IB_EVENT_SQ_DRAINED; break; case MLX5_EVENT_TYPE_SRQ_LAST_WQE: event.event = IB_EVENT_QP_LAST_WQE_REACHED; break; case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: event.event = IB_EVENT_QP_FATAL; break; case MLX5_EVENT_TYPE_PATH_MIG_FAILED: event.event = IB_EVENT_PATH_MIG_ERR; break; case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: event.event = IB_EVENT_QP_REQ_ERR; break; case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: event.event = IB_EVENT_QP_ACCESS_ERR; break; default: pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); return; } ibqp->event_handler(&event, ibqp->qp_context); } } static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) { int wqe_size; int wq_size; /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) return -EINVAL; if (!has_rq) { qp->rq.max_gs = 0; qp->rq.wqe_cnt = 0; qp->rq.wqe_shift = 0; cap->max_recv_wr = 0; cap->max_recv_sge = 0; } else { if (ucmd) { qp->rq.wqe_cnt = ucmd->rq_wqe_count; qp->rq.wqe_shift = ucmd->rq_wqe_shift; qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; qp->rq.max_post = qp->rq.wqe_cnt; } else { wqe_size = qp->wq_sig ? 
sizeof(struct mlx5_wqe_signature_seg) : 0; wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); wqe_size = roundup_pow_of_two(wqe_size); wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); qp->rq.wqe_cnt = wq_size / wqe_size; if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) { mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)); return -EINVAL; } qp->rq.wqe_shift = ilog2(wqe_size); qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; qp->rq.max_post = qp->rq.wqe_cnt; } } return 0; } static int sq_overhead(struct ib_qp_init_attr *attr) { int size = 0; switch (attr->qp_type) { case IB_QPT_XRC_INI: size += sizeof(struct mlx5_wqe_xrc_seg); /* fall through */ case IB_QPT_RC: size += sizeof(struct mlx5_wqe_ctrl_seg) + max(sizeof(struct mlx5_wqe_atomic_seg) + sizeof(struct mlx5_wqe_raddr_seg), sizeof(struct mlx5_wqe_umr_ctrl_seg) + sizeof(struct mlx5_mkey_seg)); break; case IB_QPT_XRC_TGT: return 0; case IB_QPT_UC: size += sizeof(struct mlx5_wqe_ctrl_seg) + max(sizeof(struct mlx5_wqe_raddr_seg), sizeof(struct mlx5_wqe_umr_ctrl_seg) + sizeof(struct mlx5_mkey_seg)); break; case IB_QPT_UD: if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) size += sizeof(struct mlx5_wqe_eth_pad) + sizeof(struct mlx5_wqe_eth_seg); /* fall through */ case IB_QPT_SMI: case MLX5_IB_QPT_HW_GSI: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; case MLX5_IB_QPT_REG_UMR: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_umr_ctrl_seg) + sizeof(struct mlx5_mkey_seg); break; default: return -EINVAL; } return size; } static int calc_send_wqe(struct ib_qp_init_attr *attr) { int inl_size = 0; int size; size = sq_overhead(attr); if (size < 0) return size; if (attr->cap.max_inline_data) { inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + attr->cap.max_inline_data; } size += attr->cap.max_send_sge * 
sizeof(struct mlx5_wqe_data_seg); if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) return MLX5_SIG_WQE_SIZE; else return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); } static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size) { int max_sge; if (attr->qp_type == IB_QPT_RC) max_sge = (min_t(int, wqe_size, 512) - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); else if (attr->qp_type == IB_QPT_XRC_INI) max_sge = (min_t(int, wqe_size, 512) - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_xrc_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); else max_sge = (wqe_size - sq_overhead(attr)) / sizeof(struct mlx5_wqe_data_seg); return min_t(int, max_sge, wqe_size - sq_overhead(attr) / sizeof(struct mlx5_wqe_data_seg)); } static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, struct mlx5_ib_qp *qp) { int wqe_size; int wq_size; if (!attr->cap.max_send_wr) return 0; wqe_size = calc_send_wqe(attr); mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); if (wqe_size < 0) return wqe_size; if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); return -EINVAL; } qp->max_inline_data = wqe_size - sq_overhead(attr) - sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) qp->signature_en = true; wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", qp->sq.wqe_cnt, 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); return -ENOMEM; } qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.max_gs = get_send_sge(attr, wqe_size); if 
(qp->sq.max_gs < attr->cap.max_send_sge) return -ENOMEM; attr->cap.max_send_sge = qp->sq.max_gs; qp->sq.max_post = wq_size / wqe_size; attr->cap.max_send_wr = qp->sq.max_post; return wq_size; } static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd, struct mlx5_ib_qp_base *base, struct ib_qp_init_attr *attr) { int desc_sz = 1 << qp->sq.wqe_shift; if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); return -EINVAL; } if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", ucmd->sq_wqe_count, ucmd->sq_wqe_count); return -EINVAL; } qp->sq.wqe_cnt = ucmd->sq_wqe_count; if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", qp->sq.wqe_cnt, 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); return -EINVAL; } if (attr->qp_type == IB_QPT_RAW_PACKET) { base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; } else { base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << 6); } return 0; } static int qp_has_rq(struct ib_qp_init_attr *attr) { if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT || attr->srq || attr->qp_type == MLX5_IB_QPT_REG_UMR || !attr->cap.max_recv_wr) return 0; return 1; } enum { /* this is the first blue flame register in the array of bfregs assigned * to a processes. 
Since we do not use it for blue flame but rather * regular 64 bit doorbells, we do not need a lock for maintaiing * "odd/even" order */ NUM_NON_BLUE_FLAME_BFREGS = 1, }; static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; } static int num_med_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { int n; n = max_bfregs(dev, bfregi) - bfregi->num_low_latency_bfregs - NUM_NON_BLUE_FLAME_BFREGS; return n >= 0 ? n : 0; } static int first_med_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { return num_med_bfreg(dev, bfregi) ? 1 : -ENOMEM; } static int first_hi_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { int med; med = num_med_bfreg(dev, bfregi); return ++med; } static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { int i; for (i = first_hi_bfreg(dev, bfregi); i < max_bfregs(dev, bfregi); i++) { if (!bfregi->count[i]) { bfregi->count[i]++; return i; } } return -ENOMEM; } static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { int minidx = first_med_bfreg(dev, bfregi); int i; if (minidx < 0) return minidx; for (i = minidx; i < first_hi_bfreg(dev, bfregi); i++) { if (bfregi->count[i] < bfregi->count[minidx]) minidx = i; if (!bfregi->count[minidx]) break; } bfregi->count[minidx]++; return minidx; } static int alloc_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { int bfregn = -ENOMEM; if (bfregi->lib_uar_dyn) return -EINVAL; mutex_lock(&bfregi->lock); if (bfregi->ver >= 2) { bfregn = alloc_high_class_bfreg(dev, bfregi); if (bfregn < 0) bfregn = alloc_med_class_bfreg(dev, bfregi); } if (bfregn < 0) { BUILD_BUG_ON(NUM_NON_BLUE_FLAME_BFREGS != 1); bfregn = 0; bfregi->count[bfregn]++; } mutex_unlock(&bfregi->lock); return bfregn; } void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) { 
mutex_lock(&bfregi->lock); bfregi->count[bfregn]--; mutex_unlock(&bfregi->lock); } static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) { switch (state) { case IB_QPS_RESET: return MLX5_QP_STATE_RST; case IB_QPS_INIT: return MLX5_QP_STATE_INIT; case IB_QPS_RTR: return MLX5_QP_STATE_RTR; case IB_QPS_RTS: return MLX5_QP_STATE_RTS; case IB_QPS_SQD: return MLX5_QP_STATE_SQD; case IB_QPS_SQE: return MLX5_QP_STATE_SQER; case IB_QPS_ERR: return MLX5_QP_STATE_ERR; default: return -1; } } static int to_mlx5_st(enum ib_qp_type type) { switch (type) { case IB_QPT_RC: return MLX5_QP_ST_RC; case IB_QPT_UC: return MLX5_QP_ST_UC; case IB_QPT_UD: return MLX5_QP_ST_UD; case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; case IB_QPT_XRC_INI: case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_MAX: default: return -EINVAL; } } static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); int bfregn_to_uar_index(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, u32 bfregn, bool dyn_bfreg) { unsigned int bfregs_per_sys_page; u32 index_of_sys_page; u32 offset; if (bfregi->lib_uar_dyn) return -EINVAL; bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * MLX5_NON_FP_BFREGS_PER_UAR; index_of_sys_page = bfregn / bfregs_per_sys_page; if (dyn_bfreg) { index_of_sys_page += bfregi->num_static_sys_pages; if (index_of_sys_page >= bfregi->num_sys_pages) return -EINVAL; if (bfregn > bfregi->num_dyn_bfregs || bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) { mlx5_ib_dbg(dev, "Invalid dynamic uar index\n"); return -EINVAL; } } offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; return 
bfregi->sys_pages[index_of_sys_page] + offset; } static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_pd *pd, unsigned long addr, size_t size, struct ib_umem **umem, int *npages, int *page_shift, int *ncont, u32 *offset) { int err; *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0); if (IS_ERR(*umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); return PTR_ERR(*umem); } mlx5_ib_cont_pages(*umem, addr, 0, npages, page_shift, ncont, NULL); err = mlx5_ib_get_buf_offset(addr, *page_shift, offset); if (err) { mlx5_ib_warn(dev, "bad offset\n"); goto err_umem; } mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n", addr, size, *npages, *page_shift, *ncont, *offset); return 0; err_umem: ib_umem_release(*umem); *umem = NULL; return err; } static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq) { struct mlx5_ib_ucontext *context; context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &rwq->db); if (rwq->umem) ib_umem_release(rwq->umem); } static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_rwq *rwq, struct mlx5_ib_create_wq *ucmd) { struct mlx5_ib_ucontext *context; int page_shift = 0; int npages; u32 offset = 0; int ncont = 0; int err; if (!ucmd->buf_addr) return -EINVAL; context = to_mucontext(pd->uobject->context); rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, rwq->buf_size, 0, 0); if (IS_ERR(rwq->umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); err = PTR_ERR(rwq->umem); return err; } mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, 0, &npages, &page_shift, &ncont, NULL); err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, &rwq->rq_page_offset); if (err) { mlx5_ib_warn(dev, "bad offset\n"); goto err_umem; } rwq->rq_num_pas = ncont; rwq->page_shift = page_shift; rwq->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE); mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift 
%d, ncont %d, offset %d\n", (unsigned long long)ucmd->buf_addr, rwq->buf_size, npages, page_shift, ncont, offset); err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db); if (err) { mlx5_ib_dbg(dev, "map failed\n"); goto err_umem; } rwq->create_type = MLX5_WQ_USER; return 0; err_umem: ib_umem_release(rwq->umem); return err; } static int adjust_bfregn(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) { return bfregn / MLX5_NON_FP_BFREGS_PER_UAR * MLX5_BFREGS_PER_UAR + bfregn % MLX5_NON_FP_BFREGS_PER_UAR; } static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, struct ib_qp_init_attr *attr, u32 **in, struct mlx5_ib_create_qp_resp *resp, int *inlen, struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; struct mlx5_ib_create_qp ucmd; struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; int uar_index = 0; int npages; u32 offset = 0; int bfregn; int ncont = 0; __be64 *pas; void *qpc; int err; u32 uar_flags; err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); if (err) { mlx5_ib_dbg(dev, "copy failed\n"); return err; } context = to_mucontext(pd->uobject->context); uar_flags = ucmd.flags & (MLX5_QP_FLAG_UAR_PAGE_INDEX | MLX5_QP_FLAG_BFREG_INDEX); switch (uar_flags) { case MLX5_QP_FLAG_UAR_PAGE_INDEX: uar_index = ucmd.bfreg_index; bfregn = MLX5_IB_INVALID_BFREG; break; case MLX5_QP_FLAG_BFREG_INDEX: uar_index = bfregn_to_uar_index(dev, &context->bfregi, ucmd.bfreg_index, true); if (uar_index < 0) return uar_index; bfregn = MLX5_IB_INVALID_BFREG; break; case 0: if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) return -EINVAL; bfregn = alloc_bfreg(dev, &context->bfregi); if (bfregn < 0) return bfregn; break; default: return -EINVAL; } mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); if (bfregn != MLX5_IB_INVALID_BFREG) uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn, false); qp->rq.offset = 0; qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); 
qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; err = set_user_buf_size(dev, qp, &ucmd, base, attr); if (err) goto err_bfreg; if (ucmd.buf_addr && ubuffer->buf_size) { ubuffer->buf_addr = ucmd.buf_addr; err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, &ubuffer->umem, &npages, &page_shift, &ncont, &offset); if (err) goto err_bfreg; } else { ubuffer->umem = NULL; } *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * ncont; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_umem; } pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); if (ubuffer->umem) mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0); qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); MLX5_SET(qpc, qpc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET(qpc, qpc, page_offset, offset); MLX5_SET(qpc, qpc, uar_page, uar_index); if (bfregn != MLX5_IB_INVALID_BFREG) resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); else resp->bfreg_index = MLX5_IB_INVALID_BFREG; qp->bfregn = bfregn; err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); if (err) { mlx5_ib_dbg(dev, "map failed\n"); goto err_free; } err = ib_copy_to_udata(udata, resp, sizeof(*resp)); if (err) { mlx5_ib_dbg(dev, "copy failed\n"); goto err_unmap; } qp->create_type = MLX5_QP_USER; return 0; err_unmap: mlx5_ib_db_unmap_user(context, &qp->db); err_free: kvfree(*in); err_umem: if (ubuffer->umem) ib_umem_release(ubuffer->umem); err_bfreg: if (bfregn != MLX5_IB_INVALID_BFREG) mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); return err; } static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &qp->db); if (base->ubuffer.umem) ib_umem_release(base->ubuffer.umem); /* * Free only the BFREGs which are handled by the kernel. 
* BFREGs of UARs allocated dynamically are handled by user. */ if (qp->bfregn != MLX5_IB_INVALID_BFREG) mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, u32 **in, int *inlen, struct mlx5_ib_qp_base *base) { int uar_index; void *qpc; int err; if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_IPOIB_UD_LSO | MLX5_IB_QP_CREATE_SQPN_QP1 | MLX5_IB_QP_CREATE_WC_TEST)) return -EINVAL; spin_lock_init(&qp->bf.lock32); if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) qp->bf.bfreg = &dev->fp_bfreg; else if (init_attr->create_flags & MLX5_IB_QP_CREATE_WC_TEST) qp->bf.bfreg = &dev->wc_bfreg; else qp->bf.bfreg = &dev->bfreg; /* We need to divide by two since each register is comprised of * two buffers of identical size, namely odd and even */ qp->bf.buf_size = (1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size)) / 2; uar_index = qp->bf.bfreg->index; err = calc_sq_size(dev, init_attr, qp); if (err < 0) { mlx5_ib_dbg(dev, "err %d\n", err); return err; } qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, 2 * PAGE_SIZE, &qp->buf); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); return err; } qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_buf; } qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); MLX5_SET(qpc, qpc, uar_page, uar_index); MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev)); MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); /* Set "fast registration enabled" for all kernel QPs */ MLX5_SET(qpc, qpc, fre, 1); MLX5_SET(qpc, qpc, rlky, 1); if 
(init_attr->create_flags & MLX5_IB_QP_CREATE_SQPN_QP1) { MLX5_SET(qpc, qpc, deth_sqpn, 1); qp->flags |= MLX5_IB_QP_SQPN_QP1; } mlx5_fill_page_array(&qp->buf, (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas)); err = mlx5_db_alloc(dev->mdev, &qp->db); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); goto err_free; } qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL); qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL); qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL); qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL); qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL); if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || !qp->sq.w_list || !qp->sq.wqe_head) { err = -ENOMEM; goto err_wrid; } qp->create_type = MLX5_QP_KERNEL; return 0; err_wrid: kfree(qp->sq.wqe_head); kfree(qp->sq.w_list); kfree(qp->sq.wrid); kfree(qp->sq.wr_data); kfree(qp->rq.wrid); mlx5_db_free(dev->mdev, &qp->db); err_free: kvfree(*in); err_buf: mlx5_buf_free(dev->mdev, &qp->buf); return err; } static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { kfree(qp->sq.wqe_head); kfree(qp->sq.w_list); kfree(qp->sq.wrid); kfree(qp->sq.wr_data); kfree(qp->rq.wrid); mlx5_db_free(dev->mdev, &qp->db); mlx5_buf_free(dev->mdev, &qp->buf); } static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || (attr->qp_type == IB_QPT_XRC_INI)) return MLX5_SRQ_RQ; else if (!qp->has_rq) return MLX5_ZERO_LEN_RQ; else return MLX5_NON_ZERO_RQ; } static int is_connected(enum ib_qp_type qp_type) { if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) return 1; return 0; } static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq, u32 tdn) { u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); MLX5_SET(tisc, tisc, transport_domain, tdn); return 
mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); } static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq) { mlx5_core_destroy_tis(dev->mdev, sq->tisn); } static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq, void *qpin, struct ib_pd *pd) { struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; __be64 *pas; void *in; void *sqc; void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); void *wq; int inlen; int err; int page_shift = 0; int npages; int ncont = 0; u32 offset = 0; u8 ts_format; ts_format = mlx5_get_sq_default_ts(dev->mdev); err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, &sq->ubuffer.umem, &npages, &page_shift, &ncont, &offset); if (err) return err; inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto err_umem; } sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); MLX5_SET(sqc, sqc, tis_lst_sz, 1); MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET(wq, wq, page_offset, offset); pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); kvfree(in); if (err) goto err_umem; return 0; err_umem: ib_umem_release(sq->ubuffer.umem); 
sq->ubuffer.umem = NULL;

	return err;
}

/* Destroy the SQ object and release the umem that backs its buffer. */
static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
				     struct mlx5_ib_sq *sq)
{
	mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
	ib_umem_release(sq->ubuffer.umem);
}

/*
 * Compute, in bytes, the size of the page-address array needed for the
 * receive queue described by QP context @qpc: the RQ size plus its page
 * offset, rounded up to whole pages, times sizeof(u64).
 *
 * NOTE(review): the shifts are evaluated on the int constant 1; this
 * presumably relies on the log values staying well below 31 - confirm.
 */
static int get_rq_pas_size(void *qpc)
{
	u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12;
	u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride);
	u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size);
	u32 page_offset = MLX5_GET(qpc, qpc, page_offset);
	/* page_offset is counted in units of 1/64 of a page. */
	u32 po_quanta = 1 << (log_page_size - 6);
	u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride);
	u32 page_size = 1 << log_page_size;
	u32 rq_sz_po = rq_sz + (page_offset * po_quanta);
	/* Round up to a whole number of pages. */
	u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size;

	return rq_num_pas * sizeof(u64);
}

/*
 * Create the receive queue object of a raw packet QP.
 *
 * @dev:  IB device
 * @rq:   receive queue; rq->base.container_mibqp must already point at the
 *        owning QP because its flags are read here
 * @qpin: create_qp_in command buffer whose QP context and page-address
 *        array are copied into the RQ command
 *
 * Returns 0 on success or a negative errno.
 */
static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
				   struct mlx5_ib_rq *rq, void *qpin)
{
	struct mlx5_ib_qp *mqp = rq->base.container_mibqp;
	__be64 *pas;
	__be64 *qp_pas;
	void *in;
	void *rqc;
	void *wq;
	void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc);
	int inlen;
	int err;
	u32 rq_pas_size = get_rq_pas_size(qpc);
	u8 ts_format;

	ts_format = mlx5_get_rq_default_ts(dev->mdev);

	inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size;
	in = mlx5_vzalloc(inlen);
	if (!in)
		return -ENOMEM;

	/* RQ context: created in RST state and bound to the receive CQ. */
	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
	MLX5_SET(rqc, rqc, vlan_strip_disable, 1);
	MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE);
	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
	MLX5_SET(rqc, rqc, ts_format, ts_format);
	MLX5_SET(rqc, rqc, flush_in_error_en, 1);
	MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index));
	MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv));

	if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS)
		MLX5_SET(rqc, rqc, scatter_fcs, 1);

	/* Work-queue attributes mirrored from the QP context. */
	wq = MLX5_ADDR_OF(rqc, rqc, wq);
	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
	MLX5_SET(wq, wq, end_padding_mode,
		 MLX5_GET(qpc, qpc, end_padding_mode));
	MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset));
	MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
	MLX5_SET64(wq,
wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
	/* The QP context stores the stride as log2(stride) - 4; the WQ does not. */
	MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4);
	MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size));
	MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size));

	/* Reuse the page addresses already placed in the create_qp command. */
	pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
	qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas);
	memcpy(pas, qp_pas, rq_pas_size);

	err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp);

	kvfree(in);

	return err;
}

/* Destroy the RQ object of a raw packet QP. */
static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
				     struct mlx5_ib_rq *rq)
{
	mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp);
}

/*
 * Create a direct-dispatch TIR that points at @rq in transport domain @tdn.
 * The TIR number is stored in rq->tirn.
 *
 * Returns 0 on success or a negative errno.
 */
static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_rq *rq, u32 tdn)
{
	u32 *in;
	void *tirc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
	in = mlx5_vzalloc(inlen);
	if (!in)
		return -ENOMEM;

	tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context);
	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
	MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn);
	MLX5_SET(tirc, tirc, transport_domain, tdn);

	err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);

	kvfree(in);

	return err;
}

/* Destroy the TIR whose number is stored in rq->tirn. */
static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
				      struct mlx5_ib_rq *rq)
{
	mlx5_core_destroy_tir(dev->mdev, rq->tirn);
}

/*
 * Create the hardware objects behind a raw packet QP: a TIS and SQ when the
 * QP has send WQEs, and an RQ and TIR when it has receive WQEs.
 *
 * @in is the create_qp_in command buffer prepared by the caller; the
 * transport domain is taken from the user context that owns @pd.
 *
 * Returns 0 on success or a negative errno; objects created before a
 * failure are destroyed again.
 */
static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
				u32 *in,
				struct ib_pd *pd)
{
	struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
	struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
	struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
	struct ib_uobject *uobj = pd->uobject;
	struct ib_ucontext *ucontext = uobj->context;
	struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
	int err;
	u32 tdn = mucontext->tdn;

	if (qp->sq.wqe_cnt) {
		err = create_raw_packet_qp_tis(dev, sq, tdn);
		if (err)
			return err;

		err = create_raw_packet_qp_sq(dev, sq, in, pd);
		if (err)
			goto err_destroy_tis;

		sq->base.container_mibqp = qp;
	}

	if (qp->rq.wqe_cnt) {
		/* Must be set first: RQ creation reads the owning QP's flags. */
		rq->base.container_mibqp = qp;

		err =
create_raw_packet_qp_rq(dev, rq, in); if (err) goto err_destroy_sq; err = create_raw_packet_qp_tir(dev, rq, tdn); if (err) goto err_destroy_rq; } qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn : rq->base.mqp.qpn; return 0; err_destroy_rq: destroy_raw_packet_qp_rq(dev, rq); err_destroy_sq: if (!qp->sq.wqe_cnt) return err; destroy_raw_packet_qp_sq(dev, sq); err_destroy_tis: destroy_raw_packet_qp_tis(dev, sq); return err; } static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_sq *sq = &raw_packet_qp->sq; struct mlx5_ib_rq *rq = &raw_packet_qp->rq; if (qp->rq.wqe_cnt) { destroy_raw_packet_qp_tir(dev, rq); destroy_raw_packet_qp_rq(dev, rq); } if (qp->sq.wqe_cnt) { destroy_raw_packet_qp_sq(dev, sq); destroy_raw_packet_qp_tis(dev, sq); } } static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, struct mlx5_ib_raw_packet_qp *raw_packet_qp) { struct mlx5_ib_sq *sq = &raw_packet_qp->sq; struct mlx5_ib_rq *rq = &raw_packet_qp->rq; sq->sq = &qp->sq; rq->rq = &qp->rq; sq->doorbell = &qp->db; rq->doorbell = &qp->db; } static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); } static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { struct ib_uobject *uobj = pd->uobject; struct ib_ucontext *ucontext = uobj->context; struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); struct mlx5_ib_create_qp_resp resp = {}; int inlen; int err; u32 *in; void *tirc; void *hfso; u32 selected_fields = 0; size_t min_resp_len; u32 tdn = mucontext->tdn; struct mlx5_ib_create_qp_rss ucmd = {}; size_t required_cmd_sz; if (init_attr->qp_type != IB_QPT_RAW_PACKET) return -EOPNOTSUPP; if (init_attr->create_flags || init_attr->send_cq) return -EINVAL; min_resp_len = offsetof(typeof(resp), 
bfreg_index) + sizeof(resp.bfreg_index); if (udata->outlen < min_resp_len) return -EINVAL; required_cmd_sz = offsetof(typeof(ucmd), reserved1) + sizeof(ucmd.reserved1); if (udata->inlen < required_cmd_sz) { mlx5_ib_dbg(dev, "invalid inlen\n"); return -EINVAL; } if (udata->inlen > sizeof(ucmd) && !ib_is_udata_cleared(udata, sizeof(ucmd), udata->inlen - sizeof(ucmd))) { mlx5_ib_dbg(dev, "inlen is not supported\n"); return -EOPNOTSUPP; } if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { mlx5_ib_dbg(dev, "copy failed\n"); return -EFAULT; } if (ucmd.comp_mask) { mlx5_ib_dbg(dev, "invalid comp mask\n"); return -EOPNOTSUPP; } if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)) || ucmd.reserved1) { mlx5_ib_dbg(dev, "invalid reserved\n"); return -EOPNOTSUPP; } err = ib_copy_to_udata(udata, &resp, min_resp_len); if (err) { mlx5_ib_dbg(dev, "copy failed\n"); return -EINVAL; } inlen = MLX5_ST_SZ_BYTES(create_tir_in); in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); MLX5_SET(tirc, tirc, indirect_table, init_attr->rwq_ind_tbl->ind_tbl_num); MLX5_SET(tirc, tirc, transport_domain, tdn); hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); switch (ucmd.rx_hash_function) { case MLX5_RX_HASH_FUNC_TOEPLITZ: { void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); if (len != ucmd.rx_key_len) { err = -EINVAL; goto err; } MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FUNC_TOEPLITZ); MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); memcpy(rss_key, ucmd.rx_hash_key, len); break; } default: err = -EOPNOTSUPP; goto err; } if (!ucmd.rx_hash_fields_mask) { /* special case when this TIR serves as steering entry without hashing */ if (!init_attr->rwq_ind_tbl->log_ind_tbl_size) goto create_tir; err = -EINVAL; goto err; } if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || 
(ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { err = -EINVAL; goto err; } /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) && ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))) { err = -EINVAL; goto err; } /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) selected_fields |= 
MLX5_HASH_FIELD_SEL_L4_DPORT; MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); create_tir: err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn); if (err) goto err; kvfree(in); /* qpn is reserved for that QP */ qp->trans_qp.base.mqp.qpn = 0; qp->flags |= MLX5_IB_QP_RSS; return 0; err: kvfree(in); return err; } static int atomic_size_to_mode(int size_mask) { /* driver does not support atomic_size > 256B * and does not know how to translate bigger sizes */ int supported_size_mask = size_mask & 0x1ff; int log_max_size; if (!supported_size_mask) return -EOPNOTSUPP; log_max_size = __fls(supported_size_mask); if (log_max_size > 3) return log_max_size; return MLX5_ATOMIC_MODE_8B; } static int get_atomic_mode(struct mlx5_ib_dev *dev, enum ib_qp_type qp_type) { u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); int atomic_mode = -EOPNOTSUPP; int atomic_size_mask; if (!atomic) return -EOPNOTSUPP; if (qp_type == MLX5_IB_QPT_DCT) atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); else atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); if ((atomic_operations & MLX5_ATOMIC_OPS_MASKED_CMP_SWAP) || (atomic_operations & MLX5_ATOMIC_OPS_MASKED_FETCH_ADD)) atomic_mode = atomic_size_to_mode(atomic_size_mask); if (atomic_mode <= 0 && (atomic_operations & MLX5_ATOMIC_OPS_CMP_SWAP && atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; return atomic_mode; } static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_create_qp_resp resp; struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; u32 uidx = MLX5_IB_DEFAULT_UIDX; struct mlx5_ib_create_qp ucmd; struct mlx5_ib_qp_base 
*base; void *qpc; u32 *in; int err; base = init_attr->qp_type == IB_QPT_RAW_PACKET ? &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; if (init_attr->qp_type != IB_QPT_RAW_PACKET) mlx5_ib_odp_create_qp(qp); mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); if (init_attr->rwq_ind_tbl) { if (!udata) return -ENOSYS; err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); return err; } if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); return -EINVAL; } else { qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; } } if (init_attr->create_flags & (IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV)) { if (!MLX5_CAP_GEN(mdev, cd)) { mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); return -EINVAL; } if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) qp->flags |= MLX5_IB_QP_MANAGED_SEND; if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) qp->flags |= MLX5_IB_QP_MANAGED_RECV; } if (init_attr->qp_type == IB_QPT_UD && (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) if (!MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) { mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); return -EOPNOTSUPP; } if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { if (init_attr->qp_type != IB_QPT_RAW_PACKET) { mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); return -EOPNOTSUPP; } if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); return -EOPNOTSUPP; } qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; } if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; if (pd && pd->uobject) { if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { mlx5_ib_dbg(dev, 
"copy failed\n"); return -EFAULT; } err = get_qp_user_index(to_mucontext(pd->uobject->context), &ucmd, udata->inlen, &uidx); if (err) return err; qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); } else { qp->wq_sig = !!wq_signature; } qp->has_rq = qp_has_rq(init_attr); err = set_rq_size(dev, &init_attr->cap, qp->has_rq, qp, (pd && pd->uobject) ? &ucmd : NULL); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); return err; } if (pd) { if (pd->uobject) { __u32 max_wqes = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || ucmd.rq_wqe_count != qp->rq.wqe_cnt) { mlx5_ib_dbg(dev, "invalid rq params\n"); return -EINVAL; } if (ucmd.sq_wqe_count > max_wqes) { mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", ucmd.sq_wqe_count, max_wqes); return -EINVAL; } if (init_attr->create_flags & MLX5_IB_QP_CREATE_SQPN_QP1) { mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); return -EINVAL; } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, &inlen, base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } else { err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } if (err) return err; } else { in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; qp->create_type = MLX5_QP_EMPTY; } if (is_sqp(init_attr->qp_type)) qp->port = init_attr->port_num; qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, to_mlx5_st(init_attr->qp_type)); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) MLX5_SET(qpc, qpc, pd, to_mpd(pd ? 
pd : devr->p0)->pdn); else MLX5_SET(qpc, qpc, latency_sensitive, 1); if (qp->wq_sig) MLX5_SET(qpc, qpc, wq_signature, 1); if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) MLX5_SET(qpc, qpc, block_lb_mc, 1); if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) MLX5_SET(qpc, qpc, cd_master, 1); if (qp->flags & MLX5_IB_QP_MANAGED_SEND) MLX5_SET(qpc, qpc, cd_slave_send, 1); if (qp->flags & MLX5_IB_QP_MANAGED_RECV) MLX5_SET(qpc, qpc, cd_slave_receive, 1); if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; int scqe_sz; rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); if (rcqe_sz == 128) MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); else MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { if (scqe_sz == 128) MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); else MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); } } if (qp->rq.wqe_cnt) { MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); } if (init_attr->qp_type != IB_QPT_RAW_PACKET) MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev)); MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr)); if (qp->sq.wqe_cnt) MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); else MLX5_SET(qpc, qpc, no_sq, 1); /* Set default resources */ switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s0)->msrq.srqn); MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn); break; case IB_QPT_XRC_INI: MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s0)->msrq.srqn); break; default: if (init_attr->srq) { MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn); MLX5_SET(qpc, qpc, 
srqn_rmpn, to_msrq(init_attr->srq)->msrq.srqn); } else { MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s1)->msrq.srqn); } } if (init_attr->send_cq) MLX5_SET(qpc, qpc, cqn_snd, to_mcq(init_attr->send_cq)->mcq.cqn); if (init_attr->recv_cq) MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(init_attr->recv_cq)->mcq.cqn); MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); /* 0xffffff means we ask to work with cqe version 0 */ if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) MLX5_SET(qpc, qpc, user_index, uidx); /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ if (init_attr->qp_type == IB_QPT_UD && (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); qp->flags |= MLX5_IB_QP_LSO; } if (init_attr->qp_type == IB_QPT_RAW_PACKET) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); err = create_raw_packet_qp(dev, qp, in, pd); } else { err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); } if (err) { mlx5_ib_dbg(dev, "create qp failed\n"); goto err_create; } kvfree(in); base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, &send_cq, &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx5_ib_lock_cqs(send_cq, recv_cq); /* Maintain device to QPs access, needed for further handling via reset * flow */ list_add_tail(&qp->qps_list, &dev->qp_list); /* Maintain CQ to QPs access, needed for further handling via reset flow */ if (send_cq) list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); if (recv_cq) list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); mlx5_ib_unlock_cqs(send_cq, recv_cq); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); return 0; err_create: if (qp->create_type == MLX5_QP_USER) destroy_qp_user(dev, pd, qp, base); else if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, 
qp);

	kvfree(in);
	return err;
}

/*
 * Lock the send and receive CQs of a QP; either pointer may be NULL.
 *
 * When both CQs are present and distinct, the one with the lower CQ number
 * is locked first.  When they are the same CQ, or one of them is NULL, only
 * one real spin lock is taken and the other side is covered by __acquire()
 * (presumably a static-checker annotation only - confirm).
 */
static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
{
	if (send_cq) {
		if (recv_cq) {
			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
				spin_lock(&send_cq->lock);
				spin_lock_nested(&recv_cq->lock,
						 SINGLE_DEPTH_NESTING);
			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
				/* Same CQ on both sides: lock it once. */
				spin_lock(&send_cq->lock);
				__acquire(&recv_cq->lock);
			} else {
				spin_lock(&recv_cq->lock);
				spin_lock_nested(&send_cq->lock,
						 SINGLE_DEPTH_NESTING);
			}
		} else {
			spin_lock(&send_cq->lock);
			__acquire(&recv_cq->lock);
		}
	} else if (recv_cq) {
		spin_lock(&recv_cq->lock);
		__acquire(&send_cq->lock);
	} else {
		__acquire(&send_cq->lock);
		__acquire(&recv_cq->lock);
	}
}

/*
 * Undo mlx5_ib_lock_cqs(): release the CQ locks in the reverse of the order
 * in which they were taken, using __release() where no real lock was held.
 */
static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
	__releases(&send_cq->lock) __releases(&recv_cq->lock)
{
	if (send_cq) {
		if (recv_cq) {
			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
				spin_unlock(&recv_cq->lock);
				spin_unlock(&send_cq->lock);
			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
				__release(&recv_cq->lock);
				spin_unlock(&send_cq->lock);
			} else {
				spin_unlock(&send_cq->lock);
				spin_unlock(&recv_cq->lock);
			}
		} else {
			__release(&recv_cq->lock);
			spin_unlock(&send_cq->lock);
		}
	} else if (recv_cq) {
		__release(&send_cq->lock);
		spin_unlock(&recv_cq->lock);
	} else {
		__release(&recv_cq->lock);
		__release(&send_cq->lock);
	}
}

/* Return the mlx5 protection domain the QP belongs to. */
static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
{
	return to_mpd(qp->ibqp.pd);
}

/*
 * Resolve which CQs a QP of type @qp_type actually uses.
 *
 * XRC target QPs use neither CQ; REG_UMR and XRC initiator QPs use only the
 * send CQ; the ordinary transport types use both.  Unused or absent CQs are
 * reported as NULL through @send_cq / @recv_cq.
 */
static void get_cqs(enum ib_qp_type qp_type,
		    struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
		    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
{
	switch (qp_type) {
	case IB_QPT_XRC_TGT:
		*send_cq = NULL;
		*recv_cq = NULL;
		break;
	case MLX5_IB_QPT_REG_UMR:
	case IB_QPT_XRC_INI:
		*send_cq = ib_send_cq ?
to_mcq(ib_send_cq) : NULL;
		*recv_cq = NULL;
		break;

	case IB_QPT_SMI:
	case MLX5_IB_QPT_HW_GSI:
	case IB_QPT_RC:
	case IB_QPT_UC:
	case IB_QPT_UD:
	case IB_QPT_RAW_IPV6:
	case IB_QPT_RAW_ETHERTYPE:
	case IB_QPT_RAW_PACKET:
		/* These types may use both CQs; either may still be absent. */
		*send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
		*recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL;
		break;

	case IB_QPT_MAX:
	default:
		*send_cq = NULL;
		*recv_cq = NULL;
		break;
	}
}

/* Forward declaration: destroy_qp_common() needs it before the definition. */
static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
				const struct mlx5_modify_raw_qp_param *raw_qp_param,
				u8 lag_tx_affinity);

/*
 * Common QP teardown: move the QP to RESET if needed, unlink it from the
 * device and CQ lists, destroy the hardware object(s) and release the
 * kernel- or user-side resources.
 *
 * RSS raw QPs (those with an indirection table) only own a TIR, which is
 * destroyed before returning early.
 */
static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
{
	struct mlx5_ib_cq *send_cq, *recv_cq;
	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
	unsigned long flags;
	int err;

	if (qp->ibqp.rwq_ind_tbl) {
		destroy_rss_raw_qp_tir(dev, qp);
		return;
	}

	/* Raw packet QPs keep their base object inside the RQ wrapper. */
	base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
	       &qp->raw_packet_qp.rq.base :
	       &qp->trans_qp.base;

	if (qp->state != IB_QPS_RESET) {
		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
			mlx5_ib_qp_disable_pagefaults(qp);
			err = mlx5_core_qp_modify(dev->mdev,
						  MLX5_CMD_OP_2RST_QP, 0,
						  NULL, &base->mqp);
		} else {
			struct mlx5_modify_raw_qp_param raw_qp_param = {
				.operation = MLX5_CMD_OP_2RST_QP
			};

			err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0);
		}
		/* A failed reset is only logged; teardown continues. */
		if (err)
			mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n",
				     base->mqp.qpn);
	}

	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
		&send_cq, &recv_cq);

	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
	mlx5_ib_lock_cqs(send_cq, recv_cq);
	/* del from lists under both locks above to protect reset flow paths */
	list_del(&qp->qps_list);
	if (send_cq)
		list_del(&qp->cq_send_list);

	if (recv_cq)
		list_del(&qp->cq_recv_list);

	if (qp->create_type == MLX5_QP_KERNEL) {
		__mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
				   qp->ibqp.srq ?
to_msrq(qp->ibqp.srq) : NULL);
		/* Avoid cleaning the same CQ twice when send and recv share it. */
		if (send_cq != recv_cq)
			__mlx5_ib_cq_clean(send_cq, base->mqp.qpn,
					   NULL);
	}
	mlx5_ib_unlock_cqs(send_cq, recv_cq);
	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);

	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
		destroy_raw_packet_qp(dev, qp);
	} else {
		err = mlx5_core_destroy_qp(dev->mdev, &base->mqp);
		if (err)
			mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n",
				     base->mqp.qpn);
	}

	if (qp->create_type == MLX5_QP_KERNEL)
		destroy_qp_kernel(dev, qp);
	else if (qp->create_type == MLX5_QP_USER)
		destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base);
}

/*
 * Return the name of QP type @type as a string literal, or
 * "Invalid QP type" for IB_QPT_MAX and any unlisted value.
 */
static const char *ib_qp_type_str(enum ib_qp_type type)
{
	switch (type) {
	case IB_QPT_SMI:
		return "IB_QPT_SMI";
	case IB_QPT_GSI:
		return "IB_QPT_GSI";
	case IB_QPT_RC:
		return "IB_QPT_RC";
	case IB_QPT_UC:
		return "IB_QPT_UC";
	case IB_QPT_UD:
		return "IB_QPT_UD";
	case IB_QPT_RAW_IPV6:
		return "IB_QPT_RAW_IPV6";
	case IB_QPT_RAW_ETHERTYPE:
		return "IB_QPT_RAW_ETHERTYPE";
	case IB_QPT_XRC_INI:
		return "IB_QPT_XRC_INI";
	case IB_QPT_XRC_TGT:
		return "IB_QPT_XRC_TGT";
	case IB_QPT_RAW_PACKET:
		return "IB_QPT_RAW_PACKET";
	case MLX5_IB_QPT_REG_UMR:
		return "MLX5_IB_QPT_REG_UMR";
	case IB_QPT_MAX:
	default:
		return "Invalid QP type";
	}
}

/*
 * Create a QP.
 *
 * @pd:        protection domain; may be NULL only for XRC target and
 *             REG_UMR QPs, in which case the device is taken from
 *             init_attr->xrcd
 * @init_attr: requested QP attributes; recv_cq/send_cq are cleared here for
 *             XRC types
 * @udata:     user command/response buffers, passed through to
 *             create_qp_common()
 *
 * Returns the new ib_qp or an ERR_PTR() value.
 */
struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
				struct ib_qp_init_attr *init_attr,
				struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev;
	struct mlx5_ib_qp *qp;
	u16 xrcdn = 0;
	int err;

	if (pd) {
		dev = to_mdev(pd->device);

		/* Raw packet QPs need a user context with CQE version > 0. */
		if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
			if (!pd->uobject) {
				mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
				return ERR_PTR(-EINVAL);
			} else if (!to_mucontext(pd->uobject->context)->cqe_version) {
				mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
				return ERR_PTR(-EINVAL);
			}
		}
	} else {
		/* being cautious here */
		if (init_attr->qp_type != IB_QPT_XRC_TGT &&
		    init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
			pr_warn("%s: no PD for transport %s\n", __func__,
				ib_qp_type_str(init_attr->qp_type));
			return ERR_PTR(-EINVAL);
		}
		dev
= to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
	}

	switch (init_attr->qp_type) {
	case IB_QPT_XRC_TGT:
	case IB_QPT_XRC_INI:
		if (!MLX5_CAP_GEN(dev->mdev, xrc)) {
			mlx5_ib_dbg(dev, "XRC not supported\n");
			return ERR_PTR(-ENOSYS);
		}
		/* XRC QPs never use a receive CQ; targets use no send CQ either. */
		init_attr->recv_cq = NULL;
		if (init_attr->qp_type == IB_QPT_XRC_TGT) {
			xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
			init_attr->send_cq = NULL;
		}

		/* fall through */
	case IB_QPT_RAW_PACKET:
	case IB_QPT_RC:
	case IB_QPT_UC:
	case IB_QPT_UD:
	case IB_QPT_SMI:
	case MLX5_IB_QPT_HW_GSI:
	case MLX5_IB_QPT_REG_UMR:
		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
		if (!qp)
			return ERR_PTR(-ENOMEM);

		err = create_qp_common(dev, pd, init_attr, udata, qp);
		if (err) {
			mlx5_ib_dbg(dev, "create_qp_common failed\n");
			kfree(qp);
			return ERR_PTR(err);
		}

		/* QP0 and QP1 get fixed numbers; others expose the hardware QPN. */
		if (is_qp0(init_attr->qp_type))
			qp->ibqp.qp_num = 0;
		else if (is_qp1(init_attr->qp_type))
			qp->ibqp.qp_num = 1;
		else
			qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn;

		mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
			    qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn,
			    init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1,
			    init_attr->send_cq ?
to_mcq(init_attr->send_cq)->mcq.cqn : -1); qp->trans_qp.xrcdn = xrcdn; break; case IB_QPT_GSI: return mlx5_ib_gsi_create_qp(pd, init_attr); case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: case IB_QPT_MAX: default: mlx5_ib_dbg(dev, "unsupported qp type %d\n", init_attr->qp_type); /* Don't support raw QPs */ return ERR_PTR(-EINVAL); } return &qp->ibqp; } int mlx5_ib_destroy_qp(struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); if (unlikely(qp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_destroy_qp(qp); destroy_qp_common(dev, mqp); kfree(mqp); return 0; } static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, int attr_mask, __be32 *hw_access_flags_be) { u8 dest_rd_atomic; u32 access_flags, hw_access_flags = 0; struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else dest_rd_atomic = qp->trans_qp.resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else access_flags = qp->trans_qp.atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) hw_access_flags |= MLX5_QP_BIT_RRE; if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; atomic_mode = get_atomic_mode(dev, qp->ibqp.qp_type); if (atomic_mode < 0) return -EOPNOTSUPP; hw_access_flags |= MLX5_QP_BIT_RAE; hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFF; } if (access_flags & IB_ACCESS_REMOTE_WRITE) hw_access_flags |= MLX5_QP_BIT_RWE; *hw_access_flags_be = cpu_to_be32(hw_access_flags); return 0; } enum { MLX5_PATH_FLAG_FL = 1 << 0, MLX5_PATH_FLAG_FREE_AR = 1 << 1, MLX5_PATH_FLAG_COUNTER = 1 << 2, }; static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) { if (rate == IB_RATE_PORT_CURRENT) { return 0; } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS) { return -EINVAL; } else { while (rate != IB_RATE_2_5_GBPS && !(1 << (rate + 
MLX5_STAT_RATE_OFFSET) & MLX5_CAP_GEN(dev->mdev, stat_rate_support)))
			--rate;
	}

	return rate + MLX5_STAT_RATE_OFFSET;
}

/*
 * Set the priority of a raw packet SQ's TIS from service level @sl.
 * Only the low three bits of @sl are used.
 *
 * Returns 0 on success or a negative errno.
 */
static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev,
				      struct mlx5_ib_sq *sq, u8 sl)
{
	void *in;
	void *tisc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_tis_in);
	in = mlx5_vzalloc(inlen);
	if (!in)
		return -ENOMEM;

	/* Tell firmware that only the prio field is being modified. */
	MLX5_SET(modify_tis_in, in, bitmask.prio, 1);

	tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
	MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1));

	err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);

	kvfree(in);

	return err;
}

/*
 * Set the LAG transmit port affinity of a raw packet SQ's TIS.
 *
 * Returns 0 on success or a negative errno.
 */
static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev,
					 struct mlx5_ib_sq *sq, u8 tx_affinity)
{
	void *in;
	void *tisc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_tis_in);
	in = mlx5_vzalloc(inlen);
	if (!in)
		return -ENOMEM;

	/* Tell firmware that only lag_tx_port_affinity is being modified. */
	MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1);

	tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
	MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity);

	err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);

	kvfree(in);

	return err;
}

/*
 * Fill a hardware address path from an IB address handle.
 *
 * @ah:         address handle attributes to translate
 * @path:       hardware path structure to fill
 * @port:       1-based port number (used as port - 1 to index port_caps)
 * @attr_mask:  selects whether pkey index and timeout are taken from @attr
 * @path_flags: MLX5_PATH_FLAG_* bits, used on non-Ethernet link layers
 * @alt:        true to read the alternate-path fields of @attr
 *
 * Returns 0 on success or a negative errno.
 */
static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
			 const struct ib_ah_attr *ah,
			 struct mlx5_qp_path *path, u8 port, int attr_mask,
			 u32 path_flags, const struct ib_qp_attr *attr,
			 bool alt)
{
	enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port);
	int err;
	enum ib_gid_type gid_type;

	if (attr_mask & IB_QP_PKEY_INDEX)
		path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index :
						     attr->pkey_index);

	if (ah->ah_flags & IB_AH_GRH) {
		if (ah->grh.sgid_index >=
		    dev->mdev->port_caps[port - 1].gid_table_len) {
			pr_err("sgid_index (%u) too large. max is %d\n",
			       ah->grh.sgid_index,
			       dev->mdev->port_caps[port - 1].gid_table_len);
			return -EINVAL;
		}
	}

	if (ll == IB_LINK_LAYER_ETHERNET) {
		/* On Ethernet a GRH is mandatory. */
		if (!(ah->ah_flags & IB_AH_GRH))
			return -EINVAL;
		err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index,
					     &gid_type);
		if (err)
			return err;
		memcpy(path->rmac, ah->dmac, sizeof(ah->dmac));
		path->udp_sport = mlx5_get_roce_udp_sport(dev, port,
							  ah->grh.sgid_index);
		path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4;
		/* For UDP-encapsulated GIDs the upper six traffic-class bits become the DSCP. */
		if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
			path->ecn_dscp = (ah->grh.traffic_class >> 2) & 0x3f;
	} else {
		path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
		path->fl_free_ar |=
			(path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0;
		path->rlid = cpu_to_be16(ah->dlid);
		path->grh_mlid = ah->src_path_bits & 0x7f;
		/* Bit 7 of grh_mlid flags the presence of a GRH. */
		if (ah->ah_flags & IB_AH_GRH)
			path->grh_mlid	|= 1 << 7;
		path->dci_cfi_prio_sl = ah->sl & 0xf;
	}

	if (ah->ah_flags & IB_AH_GRH) {
		path->mgid_index = ah->grh.sgid_index;
		path->hop_limit  = ah->grh.hop_limit;
		path->tclass_flowlabel =
			cpu_to_be32((ah->grh.traffic_class << 20) |
				    (ah->grh.flow_label));
		memcpy(path->rgid, ah->grh.dgid.raw, 16);
	}

	/* err doubles as the translated rate when it is non-negative. */
	err = ib_rate_to_mlx5(dev, ah->static_rate);
	if (err < 0)
		return err;
	path->static_rate = err;
	path->port = port;

	if (attr_mask & IB_QP_TIMEOUT)
		path->ackto_lt = (alt ?
attr->alt_timeout : attr->timeout) << 3;

	/* Raw packet QPs with a send queue also carry the SL in their TIS. */
	if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt)
		return modify_raw_packet_eth_prio(dev->mdev,
						  &qp->raw_packet_qp.sq,
						  ah->sl & 0xf);

	return 0;
}

/*
 * Optional-parameter masks, indexed as
 * opt_mask[first state][second state][service type].
 * NOTE(review): the first two indices are presumably the current and the
 * new QP state - confirm against the code that reads this table.
 * Combinations not listed are zero.
 */
static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = {
	[MLX5_QP_STATE_INIT] = {
		[MLX5_QP_STATE_INIT] = {
			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
					  MLX5_QP_OPTPAR_RAE		|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PKEY_INDEX	|
					  MLX5_QP_OPTPAR_PRI_PORT,
			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PKEY_INDEX	|
					  MLX5_QP_OPTPAR_PRI_PORT,
			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX	|
					  MLX5_QP_OPTPAR_Q_KEY		|
					  MLX5_QP_OPTPAR_PRI_PORT,
		},
		[MLX5_QP_STATE_RTR] = {
			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
					  MLX5_QP_OPTPAR_RRE            |
					  MLX5_QP_OPTPAR_RAE            |
					  MLX5_QP_OPTPAR_RWE            |
					  MLX5_QP_OPTPAR_PKEY_INDEX,
			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
					  MLX5_QP_OPTPAR_RWE            |
					  MLX5_QP_OPTPAR_PKEY_INDEX,
			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
					  MLX5_QP_OPTPAR_Q_KEY,
			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX	|
					   MLX5_QP_OPTPAR_Q_KEY,
			[MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
					  MLX5_QP_OPTPAR_RRE            |
					  MLX5_QP_OPTPAR_RAE            |
					  MLX5_QP_OPTPAR_RWE            |
					  MLX5_QP_OPTPAR_PKEY_INDEX,
		},
	},
	[MLX5_QP_STATE_RTR] = {
		[MLX5_QP_STATE_RTS] = {
			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
					  MLX5_QP_OPTPAR_RRE		|
					  MLX5_QP_OPTPAR_RAE		|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_RNR_TIMEOUT,
			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PM_STATE,
			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
		},
	},
	[MLX5_QP_STATE_RTS] = {
		[MLX5_QP_STATE_RTS] = {
			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
					  MLX5_QP_OPTPAR_RAE		|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_RNR_TIMEOUT	|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|
					  MLX5_QP_OPTPAR_SRQN		|
					  MLX5_QP_OPTPAR_CQN_RCV,
		},
	},
	[MLX5_QP_STATE_SQER] = {
[MLX5_QP_STATE_RTS] = { [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RRE, }, }, }; static int ib_nr_to_mlx5_nr(int ib_mask) { switch (ib_mask) { case IB_QP_STATE: return 0; case IB_QP_CUR_STATE: return 0; case IB_QP_EN_SQD_ASYNC_NOTIFY: return 0; case IB_QP_ACCESS_FLAGS: return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; case IB_QP_PKEY_INDEX: return MLX5_QP_OPTPAR_PKEY_INDEX; case IB_QP_PORT: return MLX5_QP_OPTPAR_PRI_PORT; case IB_QP_QKEY: return MLX5_QP_OPTPAR_Q_KEY; case IB_QP_AV: return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX5_QP_OPTPAR_PRI_PORT; case IB_QP_PATH_MTU: return 0; case IB_QP_TIMEOUT: return MLX5_QP_OPTPAR_ACK_TIMEOUT; case IB_QP_RETRY_CNT: return MLX5_QP_OPTPAR_RETRY_COUNT; case IB_QP_RNR_RETRY: return MLX5_QP_OPTPAR_RNR_RETRY; case IB_QP_RQ_PSN: return 0; case IB_QP_MAX_QP_RD_ATOMIC: return MLX5_QP_OPTPAR_SRA_MAX; case IB_QP_ALT_PATH: return MLX5_QP_OPTPAR_ALT_ADDR_PATH; case IB_QP_MIN_RNR_TIMER: return MLX5_QP_OPTPAR_RNR_TIMEOUT; case IB_QP_SQ_PSN: return 0; case IB_QP_MAX_DEST_RD_ATOMIC: return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; case IB_QP_PATH_MIG_STATE: return MLX5_QP_OPTPAR_PM_STATE; case IB_QP_CAP: return 0; case IB_QP_DEST_QPN: return 0; } return 0; } static int ib_mask_to_mlx5_opt(int ib_mask) { int result = 0; int i; for (i = 0; i < 8 * sizeof(int); i++) { if ((1 << i) & ib_mask) result |= ib_nr_to_mlx5_nr(1 << i); } return result; } static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, int new_state, const struct mlx5_modify_raw_qp_param *raw_qp_param) { void *in; void *rqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_rq_in); in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; MLX5_SET(modify_rq_in, in, rqn, rq->base.mqp.qpn); MLX5_SET(modify_rq_in, in, 
rq_state, rq->state); rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(rqc, rqc, state, new_state); if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) { if (MLX5_CAP_GEN(dev->mdev, modify_rq_counters_set_id)) { MLX5_SET64(modify_rq_in, in, modify_bitmask, MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID); MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); } else pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", dev->ib_dev.name); } err = mlx5_core_modify_rq(dev->mdev, in, inlen); if (err) goto out; rq->state = new_state; out: kvfree(in); return err; } static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, struct mlx5_ib_sq *sq, int new_state) { void *in; void *sqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_sq_in); in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; MLX5_SET(modify_sq_in, in, sqn, sq->base.mqp.qpn); MLX5_SET(modify_sq_in, in, sq_state, sq->state); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); MLX5_SET(sqc, sqc, state, new_state); err = mlx5_core_modify_sq(dev, in, inlen); if (err) goto out; sq->state = new_state; out: kvfree(in); return err; } static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, const struct mlx5_modify_raw_qp_param *raw_qp_param, u8 tx_affinity) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_rq *rq = &raw_packet_qp->rq; struct mlx5_ib_sq *sq = &raw_packet_qp->sq; int rq_state; int sq_state; int err; switch (raw_qp_param->operation) { case MLX5_CMD_OP_RST2INIT_QP: rq_state = MLX5_RQC_STATE_RDY; sq_state = MLX5_SQC_STATE_RDY; break; case MLX5_CMD_OP_2ERR_QP: rq_state = MLX5_RQC_STATE_ERR; sq_state = MLX5_SQC_STATE_ERR; break; case MLX5_CMD_OP_2RST_QP: rq_state = MLX5_RQC_STATE_RST; sq_state = MLX5_SQC_STATE_RST; break; case MLX5_CMD_OP_INIT2INIT_QP: case MLX5_CMD_OP_INIT2RTR_QP: case MLX5_CMD_OP_RTR2RTS_QP: case MLX5_CMD_OP_RTS2RTS_QP: if (raw_qp_param->set_mask) return -EINVAL; else return 0; 
default: WARN_ON(1); return -EINVAL; } if (qp->rq.wqe_cnt) { err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param); if (err) return err; } if (qp->sq.wqe_cnt) { if (tx_affinity) { err = modify_raw_packet_tx_affinity(dev->mdev, sq, tx_affinity); if (err) return err; } return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state); } return 0; } static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { [MLX5_QP_STATE_RST] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, }, [MLX5_QP_STATE_INIT] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, }, [MLX5_QP_STATE_RTR] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, }, [MLX5_QP_STATE_RTS] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, }, [MLX5_QP_STATE_SQD] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, }, [MLX5_QP_STATE_SQER] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, }, [MLX5_QP_STATE_ERR] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, } }; struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; struct mlx5_ib_pd *pd; struct mlx5_ib_port *mibport = NULL; enum mlx5_qp_state mlx5_cur, mlx5_new; enum mlx5_qp_optpar optpar; int sqd_event; 
int mlx5_st; int err; u16 op; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; err = to_mlx5_st(ibqp->qp_type); if (err < 0) { mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); goto out; } context->flags = cpu_to_be32(err << 16); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); } else { switch (attr->path_mig_state) { case IB_MIG_MIGRATED: context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); break; case IB_MIG_REARM: context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); break; case IB_MIG_ARMED: context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); break; } } if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; } else if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); err = -EINVAL; goto out; } context->mtu_msgmax = (attr->path_mtu << 5) | (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); } if (attr_mask & IB_QP_DEST_QPN) context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); if (attr_mask & IB_QP_PKEY_INDEX) context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); /* todo implement counter_index functionality */ if (is_sqp(ibqp->qp_type)) context->pri_path.port = qp->port; if (attr_mask & IB_QP_PORT) context->pri_path.port = attr->port_num; if (attr_mask & IB_QP_AV) { err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port, attr_mask, 0, attr, false); if (err) goto out; } if (attr_mask & IB_QP_TIMEOUT) context->pri_path.ackto_lt |= attr->timeout << 3; if (attr_mask & IB_QP_ALT_PATH) { err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, &context->alt_path, attr->alt_port_num, attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, 0, attr, true); if (err) goto out; } pd = get_pd(qp); get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, &recv_cq); context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); if (attr_mask & IB_QP_RNR_RETRY) context->params1 |= cpu_to_be32(attr->rnr_retry << 13); if (attr_mask & IB_QP_RETRY_CNT) context->params1 |= cpu_to_be32(attr->retry_cnt << 16); if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic) context->params1 |= cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); } if (attr_mask & IB_QP_SQ_PSN) context->next_send_psn = cpu_to_be32(attr->sq_psn); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) context->params2 |= cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); } if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { __be32 access_flags; err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags); if (err) goto out; context->params2 |= access_flags; } if (attr_mask & IB_QP_MIN_RNR_TIMER) context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); if (attr_mask & IB_QP_RQ_PSN) context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); if (attr_mask & IB_QP_QKEY) context->qkey = cpu_to_be32(attr->qkey); if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) 
sqd_event = 1; else sqd_event = 0; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : qp->port) - 1; mibport = &dev->port[port_num]; context->qp_counter_set_usr_page |= cpu_to_be32((u32)(mibport->q_cnt_id) << 24); } if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->sq_crq_size |= cpu_to_be16(1 << 4); if (qp->flags & MLX5_IB_QP_SQPN_QP1) context->deth_sqpn = cpu_to_be32(1); mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); mlx5_st = to_mlx5_st(ibqp->qp_type); if (mlx5_st < 0) goto out; /* If moving to a reset or error state, we must disable page faults on * this QP and flush all current page faults. Otherwise a stale page * fault may attempt to work on this QP after it is reset and moved * again to RTS, and may cause the driver and the device to get out of * sync. */ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) && (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_disable_pagefaults(qp); if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || !optab[mlx5_cur][mlx5_new]) goto out; op = optab[mlx5_cur][mlx5_new]; optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { struct mlx5_modify_raw_qp_param raw_qp_param = {}; raw_qp_param.operation = op; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { raw_qp_param.rq_q_ctr_id = mibport->q_cnt_id; raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; } err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0); } else { err = mlx5_core_qp_modify(dev->mdev, op, optpar, context, &base->mqp); } if (err) goto out; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT && (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_enable_pagefaults(qp); qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) qp->trans_qp.atomic_rd_en = 
attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_ALT_PATH) qp->trans_qp.alt_port = attr->alt_port_num; /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !ibqp->uobject) { mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, ibqp->srq ? to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; qp->sq.head = 0; qp->sq.tail = 0; qp->sq.cur_post = 0; qp->sq.last_poll = 0; qp->db.db[MLX5_RCV_DBR] = 0; qp->db.db[MLX5_SND_DBR] = 0; } out: kfree(context); return err; } int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; if (ibqp->rwq_ind_tbl) return -ENOSYS; if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? IB_QPT_GSI : ibqp->qp_type; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (qp_type != MLX5_IB_QPT_REG_UMR && !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) { mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", attr->port_num, dev->num_ports); goto out; } if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; if (attr->pkey_index >= dev->mdev->port_caps[port - 1].pkey_table_len) { mlx5_ib_dbg(dev, "invalid pkey index %d\n", attr->pkey_index); goto out; } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", attr->max_rd_atomic); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", attr->max_dest_rd_atomic); goto out; } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; goto out; } err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); out: mutex_unlock(&qp->mutex); return err; } static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) { struct mlx5_ib_cq *cq; unsigned cur; cur = wq->head - wq->tail; if (likely(cur + nreq < wq->max_post)) return 0; cq = to_mcq(ib_cq); spin_lock(&cq->lock); cur = wq->head - wq->tail; spin_unlock(&cq->lock); return cur + nreq >= wq->max_post; } static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, u64 remote_addr, u32 rkey) { rseg->raddr = cpu_to_be64(remote_addr); rseg->rkey = cpu_to_be32(rkey); rseg->reserved = 0; } static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, - struct ib_send_wr *wr, void *qend, + const struct ib_send_wr *wr, void *qend, struct mlx5_ib_qp *qp, int *size) { void *seg = eseg; memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); if (wr->send_flags & IB_SEND_IP_CSUM) eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; seg += sizeof(struct mlx5_wqe_eth_seg); *size += sizeof(struct mlx5_wqe_eth_seg) / 16; if (wr->opcode == IB_WR_LSO) { struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start); u64 left, leftlen, copysz; void *pdata = ud_wr->header; left = ud_wr->hlen; eseg->mss = 
cpu_to_be16(ud_wr->mss); eseg->inline_hdr_sz = cpu_to_be16(left); /* * check if there is space till the end of queue, if yes, * copy all in one shot, otherwise copy till the end of queue, * rollback and than the copy the left */ leftlen = qend - (void *)eseg->inline_hdr_start; copysz = min_t(u64, leftlen, left); memcpy(seg - size_of_inl_hdr_start, pdata, copysz); if (likely(copysz > size_of_inl_hdr_start)) { seg += ALIGN(copysz - size_of_inl_hdr_start, 16); *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16; } if (unlikely(copysz < left)) { /* the last wqe in the queue */ seg = mlx5_get_send_wqe(qp, 0); left -= copysz; pdata += copysz; memcpy(seg, pdata, left); seg += ALIGN(left, 16); *size += ALIGN(left, 16) / 16; } } return seg; } static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey); } static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) { dseg->byte_count = cpu_to_be32(sg->length); dseg->lkey = cpu_to_be32(sg->lkey); dseg->addr = cpu_to_be64(sg->addr); } static __be16 get_klm_octo(int npages) { return cpu_to_be16(ALIGN(npages, 8) / 2); } static __be64 frwr_mkey_mask(void) { u64 result; result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | MLX5_MKEY_MASK_START_ADDR | MLX5_MKEY_MASK_EN_RINVAL | MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_LR | MLX5_MKEY_MASK_LW | MLX5_MKEY_MASK_RR | MLX5_MKEY_MASK_RW | MLX5_MKEY_MASK_A | MLX5_MKEY_MASK_SMALL_FENCE | MLX5_MKEY_MASK_FREE; return cpu_to_be64(result); } static __be64 sig_mkey_mask(void) { u64 result; result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | MLX5_MKEY_MASK_START_ADDR | MLX5_MKEY_MASK_EN_SIGERR | MLX5_MKEY_MASK_EN_RINVAL | MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_LR | MLX5_MKEY_MASK_LW | MLX5_MKEY_MASK_RR | 
MLX5_MKEY_MASK_RW | MLX5_MKEY_MASK_SMALL_FENCE | MLX5_MKEY_MASK_FREE | MLX5_MKEY_MASK_BSF_EN; return cpu_to_be64(result); } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, struct mlx5_ib_mr *mr) { int ndescs = mr->ndescs; memset(umr, 0, sizeof(*umr)); if (mr->access_mode == MLX5_ACCESS_MODE_KLM) /* KLMs take twice the size of MTTs */ ndescs *= 2; umr->flags = MLX5_UMR_CHECK_NOT_FREE; umr->klm_octowords = get_klm_octo(ndescs); umr->mkey_mask = frwr_mkey_mask(); } static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) { memset(umr, 0, sizeof(*umr)); umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); umr->flags = 1 << 7; } static __be64 get_umr_reg_mr_mask(void) { u64 result; result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | MLX5_MKEY_MASK_START_ADDR | MLX5_MKEY_MASK_PD | MLX5_MKEY_MASK_LR | MLX5_MKEY_MASK_LW | MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_RR | MLX5_MKEY_MASK_RW | MLX5_MKEY_MASK_A | MLX5_MKEY_MASK_FREE; return cpu_to_be64(result); } static __be64 get_umr_unreg_mr_mask(void) { u64 result; result = MLX5_MKEY_MASK_FREE; return cpu_to_be64(result); } static __be64 get_umr_update_mtt_mask(void) { u64 result; result = MLX5_MKEY_MASK_FREE; return cpu_to_be64(result); } static __be64 get_umr_update_translation_mask(void) { u64 result; result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | MLX5_MKEY_MASK_START_ADDR | MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_FREE; return cpu_to_be64(result); } static __be64 get_umr_update_access_mask(void) { u64 result; result = MLX5_MKEY_MASK_LW | MLX5_MKEY_MASK_RR | MLX5_MKEY_MASK_RW | MLX5_MKEY_MASK_A | MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_FREE; return cpu_to_be64(result); } static __be64 get_umr_update_pd_mask(void) { u64 result; result = MLX5_MKEY_MASK_PD | MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_FREE; return cpu_to_be64(result); } static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { - struct mlx5_umr_wr *umrwr = umr_wr(wr); + const struct 
mlx5_umr_wr *umrwr = umr_wr(wr); memset(umr, 0, sizeof(*umr)); if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ else umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { umr->klm_octowords = get_klm_octo(umrwr->npages); if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { umr->mkey_mask = get_umr_update_mtt_mask(); umr->bsf_octowords = get_klm_octo(umrwr->target.offset); umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; } if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) umr->mkey_mask |= get_umr_update_translation_mask(); if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS) umr->mkey_mask |= get_umr_update_access_mask(); if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD) umr->mkey_mask |= get_umr_update_pd_mask(); if (!umr->mkey_mask) umr->mkey_mask = get_umr_reg_mr_mask(); } else { umr->mkey_mask = get_umr_unreg_mr_mask(); } if (!wr->num_sge) umr->flags |= MLX5_UMR_INLINE; } static u8 get_umr_flags(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? 
MLX5_PERM_LOCAL_WRITE : 0) | MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; } static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, struct mlx5_ib_mr *mr, u32 key, int access) { int ndescs = ALIGN(mr->ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); if (mr->access_mode == MLX5_ACCESS_MODE_MTT) seg->log2_page_size = ilog2(mr->ibmr.page_size); else if (mr->access_mode == MLX5_ACCESS_MODE_KLM) /* KLMs take twice the size of MTTs */ ndescs *= 2; seg->flags = get_umr_flags(access) | mr->access_mode; seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); seg->start_addr = cpu_to_be64(mr->ibmr.iova); seg->len = cpu_to_be64(mr->ibmr.length); seg->xlt_oct_size = cpu_to_be32(ndescs); } static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) { memset(seg, 0, sizeof(*seg)); seg->status = MLX5_MKEY_STATUS_FREE; } -static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, const struct ib_send_wr *wr) { - struct mlx5_umr_wr *umrwr = umr_wr(wr); + const struct mlx5_umr_wr *umrwr = umr_wr(wr); memset(seg, 0, sizeof(*seg)); if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { seg->status = MLX5_MKEY_STATUS_FREE; return; } seg->flags = convert_access(umrwr->access_flags); if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { if (umrwr->pd) seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); } seg->len = cpu_to_be64(umrwr->length); seg->log2_page_size = umrwr->page_shift; seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | mlx5_mkey_variant(umrwr->mkey)); } static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, struct mlx5_ib_mr *mr, struct mlx5_ib_pd *pd) { int bcount = mr->desc_size * mr->ndescs; dseg->addr = cpu_to_be64(mr->desc_map); dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); } -static __be32 send_ieth(struct ib_send_wr *wr) +static 
__be32 send_ieth(const struct ib_send_wr *wr) { switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: case IB_WR_RDMA_WRITE_WITH_IMM: return wr->ex.imm_data; case IB_WR_SEND_WITH_INV: return cpu_to_be32(wr->ex.invalidate_rkey); default: return 0; } } static u8 calc_sig(void *wqe, int size) { u8 *p = wqe; u8 res = 0; int i; for (i = 0; i < size; i++) res ^= p[i]; return ~res; } static u8 wq_sig(void *wqe) { return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); } -static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, +static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, void *wqe, int *sz) { struct mlx5_wqe_inline_seg *seg; void *qend = qp->sq.qend; void *addr; int inl = 0; int copy; int len; int i; seg = wqe; wqe += sizeof(*seg); for (i = 0; i < wr->num_sge; i++) { addr = (void *)(unsigned long)(wr->sg_list[i].addr); len = wr->sg_list[i].length; inl += len; if (unlikely(inl > qp->max_inline_data)) return -ENOMEM; if (unlikely(wqe + len > qend)) { copy = qend - wqe; memcpy(wqe, addr, copy); addr += copy; len -= copy; wqe = mlx5_get_send_wqe(qp, 0); } memcpy(wqe, addr, len); wqe += len; } seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; return 0; } static u16 prot_field_size(enum ib_signature_type type) { switch (type) { case IB_SIG_TYPE_T10_DIF: return MLX5_DIF_SIZE; default: return 0; } } static u8 bs_selector(int block_size) { switch (block_size) { case 512: return 0x1; case 520: return 0x2; case 4096: return 0x3; case 4160: return 0x4; case 1073741824: return 0x5; default: return 0; } } static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, struct mlx5_bsf_inl *inl) { /* Valid inline section and allow BSF refresh */ inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | MLX5_BSF_REFRESH_DIF); inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); /* repeating block */ inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; 
inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? MLX5_DIF_CRC : MLX5_DIF_IPCS; if (domain->sig.dif.ref_remap) inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; if (domain->sig.dif.app_escape) { if (domain->sig.dif.ref_escape) inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; else inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; } inl->dif_app_bitmask_check = cpu_to_be16(domain->sig.dif.apptag_check_mask); } static int mlx5_set_bsf(struct ib_mr *sig_mr, struct ib_sig_attrs *sig_attrs, struct mlx5_bsf *bsf, u32 data_size) { struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; struct mlx5_bsf_basic *basic = &bsf->basic; struct ib_sig_domain *mem = &sig_attrs->mem; struct ib_sig_domain *wire = &sig_attrs->wire; memset(bsf, 0, sizeof(*bsf)); /* Basic + Extended + Inline */ basic->bsf_size_sbs = 1 << 7; /* Input domain check byte mask */ basic->check_byte_mask = sig_attrs->check_mask; basic->raw_data_size = cpu_to_be32(data_size); /* Memory domain */ switch (sig_attrs->mem.sig_type) { case IB_SIG_TYPE_NONE: break; case IB_SIG_TYPE_T10_DIF: basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); mlx5_fill_inl_bsf(mem, &bsf->m_inl); break; default: return -EINVAL; } /* Wire domain */ switch (sig_attrs->wire.sig_type) { case IB_SIG_TYPE_NONE: break; case IB_SIG_TYPE_T10_DIF: if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && mem->sig_type == wire->sig_type) { /* Same block structure */ basic->bsf_size_sbs |= 1 << 4; if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; } else basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); mlx5_fill_inl_bsf(wire, 
&bsf->w_inl); break; default: return -EINVAL; } return 0; } -static int set_sig_data_segment(struct ib_sig_handover_wr *wr, +static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, struct mlx5_ib_qp *qp, void **seg, int *size) { struct ib_sig_attrs *sig_attrs = wr->sig_attrs; struct ib_mr *sig_mr = wr->sig_mr; struct mlx5_bsf *bsf; u32 data_len = wr->wr.sg_list->length; u32 data_key = wr->wr.sg_list->lkey; u64 data_va = wr->wr.sg_list->addr; int ret; int wqe_size; if (!wr->prot || (data_key == wr->prot->lkey && data_va == wr->prot->addr && data_len == wr->prot->length)) { /** * Source domain doesn't contain signature information * or data and protection are interleaved in memory. * So need construct: * ------------------ * | data_klm | * ------------------ * | BSF | * ------------------ **/ struct mlx5_klm *data_klm = *seg; data_klm->bcount = cpu_to_be32(data_len); data_klm->key = cpu_to_be32(data_key); data_klm->va = cpu_to_be64(data_va); wqe_size = ALIGN(sizeof(*data_klm), 64); } else { /** * Source domain contains signature information * So need construct a strided block format: * --------------------------- * | stride_block_ctrl | * --------------------------- * | data_klm | * --------------------------- * | prot_klm | * --------------------------- * | BSF | * --------------------------- **/ struct mlx5_stride_block_ctrl_seg *sblock_ctrl; struct mlx5_stride_block_entry *data_sentry; struct mlx5_stride_block_entry *prot_sentry; u32 prot_key = wr->prot->lkey; u64 prot_va = wr->prot->addr; u16 block_size = sig_attrs->mem.sig.dif.pi_interval; int prot_size; sblock_ctrl = *seg; data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); prot_sentry = (void *)data_sentry + sizeof(*data_sentry); prot_size = prot_field_size(sig_attrs->mem.sig_type); if (!prot_size) { pr_err("Bad block size given: %u\n", block_size); return -EINVAL; } sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + prot_size); sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); 
sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); sblock_ctrl->num_entries = cpu_to_be16(2); data_sentry->bcount = cpu_to_be16(block_size); data_sentry->key = cpu_to_be32(data_key); data_sentry->va = cpu_to_be64(data_va); data_sentry->stride = cpu_to_be16(block_size); prot_sentry->bcount = cpu_to_be16(prot_size); prot_sentry->key = cpu_to_be32(prot_key); prot_sentry->va = cpu_to_be64(prot_va); prot_sentry->stride = cpu_to_be16(prot_size); wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + sizeof(*prot_sentry), 64); } *seg += wqe_size; *size += wqe_size / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); bsf = *seg; ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); if (ret) return -EINVAL; *seg += sizeof(*bsf); *size += sizeof(*bsf) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); return 0; } static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, - struct ib_sig_handover_wr *wr, u32 nelements, + const struct ib_sig_handover_wr *wr, u32 nelements, u32 length, u32 pdn) { struct ib_mr *sig_mr = wr->sig_mr; u32 sig_key = sig_mr->rkey; u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; memset(seg, 0, sizeof(*seg)); seg->flags = get_umr_flags(wr->access_flags) | MLX5_ACCESS_MODE_KLM; seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | MLX5_MKEY_BSF_EN | pdn); seg->len = cpu_to_be64(length); seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); } static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, u32 nelements) { memset(umr, 0, sizeof(*umr)); umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; umr->klm_octowords = get_klm_octo(nelements); umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); umr->mkey_mask = sig_mkey_mask(); } -static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, +static 
int set_sig_umr_wr(const struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, void **seg, int *size) { - struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); + const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); u32 pdn = get_pd(qp)->pdn; u32 klm_oct_size; int region_len, ret; if (unlikely(wr->wr.num_sge != 1) || unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || unlikely(!sig_mr->sig->sig_status_checked)) return -EINVAL; /* length of the protected region, data + protection */ region_len = wr->wr.sg_list->length; if (wr->prot && (wr->prot->lkey != wr->wr.sg_list->lkey || wr->prot->addr != wr->wr.sg_list->addr || wr->prot->length != wr->wr.sg_list->length)) region_len += wr->prot->length; /** * KLM octoword size - if protection was provided * then we use strided block format (3 octowords), * else we use single KLM (1 octoword) **/ klm_oct_size = wr->prot ? 3 : 1; set_sig_umr_segment(*seg, klm_oct_size); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); ret = set_sig_data_segment(wr, qp, seg, size); if (ret) return ret; sig_mr->sig->sig_status_checked = false; return 0; } static int set_psv_wr(struct ib_sig_domain *domain, u32 psv_idx, void **seg, int *size) { struct mlx5_seg_set_psv *psv_seg = *seg; memset(psv_seg, 0, sizeof(*psv_seg)); psv_seg->psv_num = cpu_to_be32(psv_idx); switch (domain->sig_type) { case IB_SIG_TYPE_NONE: break; case IB_SIG_TYPE_T10_DIF: psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | domain->sig.dif.app_tag); psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); break; 
default: pr_err("Bad signature type given.\n"); return 1; } *seg += sizeof(*psv_seg); *size += sizeof(*psv_seg) / 16; return 0; } static int set_reg_wr(struct mlx5_ib_qp *qp, - struct ib_reg_wr *wr, + const struct ib_reg_wr *wr, void **seg, int *size) { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { mlx5_ib_warn(to_mdev(qp->ibqp.device), "Invalid IB_SEND_INLINE send flag\n"); return -EINVAL; } set_reg_umr_seg(*seg, mr); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); set_reg_mkey_seg(*seg, mr, wr->key, wr->access); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); set_reg_data_seg(*seg, mr, pd); *seg += sizeof(struct mlx5_wqe_data_seg); *size += (sizeof(struct mlx5_wqe_data_seg) / 16); return 0; } static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size) { set_linv_umr_seg(*seg); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); set_linv_mkey_seg(*seg); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); } static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) { __be32 *p = NULL; int tidx = idx; int i, j; pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { if ((i & 0xf) == 0) { void *buf = mlx5_get_send_wqe(qp, tidx); tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); p = buf; j = 0; } pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), be32_to_cpu(p[j + 3])); } } -static u8 get_fence(u8 fence, struct ib_send_wr *wr) +static 
u8 get_fence(u8 fence, const struct ib_send_wr *wr) { if (unlikely(wr->opcode == IB_WR_LOCAL_INV && wr->send_flags & IB_SEND_FENCE)) return MLX5_FENCE_MODE_STRONG_ORDERING; if (unlikely(fence)) { if (wr->send_flags & IB_SEND_FENCE) return MLX5_FENCE_MODE_SMALL_AND_FENCE; else return fence; } else if (unlikely(wr->send_flags & IB_SEND_FENCE)) { return MLX5_FENCE_MODE_FENCE; } return 0; } -static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, - struct mlx5_wqe_ctrl_seg **ctrl, - struct ib_send_wr *wr, unsigned *idx, - int *size, int nreq) +static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq, bool send_signaled, bool solicited) { if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) return -ENOMEM; *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); *seg = mlx5_get_send_wqe(qp, *idx); *ctrl = *seg; *(uint32_t *)(*seg + 8) = 0; (*ctrl)->imm = send_ieth(wr); (*ctrl)->fm_ce_se = qp->sq_signal_bits | - (wr->send_flags & IB_SEND_SIGNALED ? - MLX5_WQE_CTRL_CQ_UPDATE : 0) | - (wr->send_flags & IB_SEND_SOLICITED ? - MLX5_WQE_CTRL_SOLICITED : 0); + (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (solicited ? 
MLX5_WQE_CTRL_SOLICITED : 0); *seg += sizeof(**ctrl); *size = sizeof(**ctrl) / 16; return 0; } +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + return __begin_wqe(qp, seg, ctrl, wr, idx, size, nreq, + wr->send_flags & IB_SEND_SIGNALED, + wr->send_flags & IB_SEND_SOLICITED); +} + static void finish_wqe(struct mlx5_ib_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl, u8 size, unsigned idx, u64 wr_id, int nreq, u8 fence, u8 next_fence, u32 mlx5_opcode) { u8 opmod = 0; ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | mlx5_opcode | ((u32)opmod << 24)); ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); ctrl->fm_ce_se |= fence; qp->fm_cache = next_fence; if (unlikely(qp->wq_sig)) ctrl->signature = wq_sig(ctrl); qp->sq.wrid[idx] = wr_id; qp->sq.w_list[idx].opcode = mlx5_opcode; qp->sq.wqe_head[idx] = qp->sq.head + nreq; qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); qp->sq.w_list[idx].next = qp->sq.cur_post; } -int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; struct mlx5_wqe_data_seg *dpseg; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; int uninitialized_var(size); void *qend; unsigned long flags; unsigned idx; int err = 0; int inl = 0; int num_sge; void *seg; int nreq; int i; u8 next_fence = 0; u8 fence; if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); qp = to_mqp(ibqp); bf = &qp->bf; qend = qp->sq.qend; spin_lock_irqsave(&qp->sq.lock, flags); if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; 
nreq = 0; goto out; } for (nreq = 0; wr; nreq++, wr = wr->next) { if (unlikely(wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { mlx5_ib_warn(dev, "\n"); err = -EINVAL; *bad_wr = wr; goto out; } fence = qp->fm_cache; num_sge = wr->num_sge; if (unlikely(num_sge > qp->sq.max_gs)) { mlx5_ib_warn(dev, "\n"); err = -EINVAL; *bad_wr = wr; goto out; } err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); if (err) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; *bad_wr = wr; goto out; } switch (ibqp->qp_type) { case IB_QPT_XRC_INI: xrc = seg; seg += sizeof(*xrc); size += sizeof(*xrc) / 16; /* fall through */ case IB_QPT_RC: switch (wr->opcode) { case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(seg, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); seg += sizeof(struct mlx5_wqe_raddr_seg); size += sizeof(struct mlx5_wqe_raddr_seg) / 16; break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); err = -ENOSYS; *bad_wr = wr; goto out; case IB_WR_LOCAL_INV: next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); set_linv_wr(qp, &seg, &size); num_sge = 0; break; case IB_WR_REG_MR: next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; qp->sq.wr_data[idx] = IB_WR_REG_MR; ctrl->imm = cpu_to_be32(reg_wr(wr)->key); err = set_reg_wr(qp, reg_wr(wr), &seg, &size); if (err) { *bad_wr = wr; goto out; } num_sge = 0; break; case IB_WR_REG_SIG_MR: qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; mr = to_mmr(sig_handover_wr(wr)->sig_mr); ctrl->imm = cpu_to_be32(mr->ibmr.rkey); err = set_sig_umr_wr(wr, qp, &seg, &size); if (err) { mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, get_fence(fence, wr), next_fence, MLX5_OPCODE_UMR); /* * SET_PSV WQEs are not signaled and solicited * on error */ - wr->send_flags &= 
~IB_SEND_SIGNALED; - wr->send_flags |= IB_SEND_SOLICITED; - err = begin_wqe(qp, &seg, &ctrl, wr, - &idx, &size, nreq); + err = __begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq, false, true); if (err) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; *bad_wr = wr; goto out; } err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem, mr->sig->psv_memory.psv_idx, &seg, &size); if (err) { mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, get_fence(fence, wr), next_fence, MLX5_OPCODE_SET_PSV); - err = begin_wqe(qp, &seg, &ctrl, wr, - &idx, &size, nreq); + err = __begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq, false, true); if (err) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; *bad_wr = wr; goto out; } next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire, mr->sig->psv_wire.psv_idx, &seg, &size); if (err) { mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, get_fence(fence, wr), next_fence, MLX5_OPCODE_SET_PSV); num_sge = 0; goto skip_psv; default: break; } break; case IB_QPT_UC: switch (wr->opcode) { case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(seg, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); seg += sizeof(struct mlx5_wqe_raddr_seg); size += sizeof(struct mlx5_wqe_raddr_seg) / 16; break; default: break; } break; case IB_QPT_SMI: case MLX5_IB_QPT_HW_GSI: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); size += sizeof(struct mlx5_wqe_datagram_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); break; case IB_QPT_UD: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); size += sizeof(struct mlx5_wqe_datagram_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); /* handle qp that supports ud offload */ if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { struct mlx5_wqe_eth_pad *pad; pad = seg; memset(pad, 0, sizeof(struct 
mlx5_wqe_eth_pad)); seg += sizeof(struct mlx5_wqe_eth_pad); size += sizeof(struct mlx5_wqe_eth_pad) / 16; seg = set_eth_seg(seg, wr, qend, qp, &size); if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); } break; case MLX5_IB_QPT_REG_UMR: if (wr->opcode != MLX5_IB_WR_UMR) { err = -EINVAL; mlx5_ib_warn(dev, "bad opcode\n"); goto out; } qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey); set_reg_umr_segment(seg, wr); seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); set_reg_mkey_segment(seg, wr); seg += sizeof(struct mlx5_mkey_seg); size += sizeof(struct mlx5_mkey_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); break; default: break; } if (wr->send_flags & IB_SEND_INLINE && num_sge) { int uninitialized_var(sz); err = set_data_inl_seg(qp, wr, seg, &sz); if (unlikely(err)) { mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } inl = 1; size += sz; } else { dpseg = seg; for (i = 0; i < num_sge; i++) { if (unlikely(dpseg == qend)) { seg = mlx5_get_send_wqe(qp, 0); dpseg = seg; } if (likely(wr->sg_list[i].length)) { set_data_ptr_seg(dpseg, wr->sg_list + i); size += sizeof(struct mlx5_wqe_data_seg) / 16; dpseg++; } } } finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, get_fence(fence, wr), next_fence, mlx5_ib_opcode[wr->opcode]); skip_psv: if (0) dump_wqe(qp, idx, size); } out: if (likely(nreq)) { qp->sq.head += nreq; /* Make sure that descriptors are written before * updating doorbell record and ringing the doorbell */ wmb(); qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); /* Make sure doorbell record is visible to the HCA before * we hit doorbell */ wmb(); mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, MLX5_GET_DOORBELL_LOCK(&bf->lock32)); /* Make sure doorbells don't leak out of SQ spinlock * and reach the HCA out of order. 
*/ bf->offset ^= bf->buf_size; } spin_unlock_irqrestore(&qp->sq.lock, flags); return err; } static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) { sig->signature = calc_sig(sig, size); } -int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_wqe_data_seg *scat; struct mlx5_rwqe_sig *sig; struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; unsigned long flags; int err = 0; int nreq; int ind; int i; if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); spin_lock_irqsave(&qp->rq.lock, flags); if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; nreq = 0; goto out; } ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; nreq++, wr = wr->next) { if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->rq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } scat = get_recv_wqe(qp, ind); if (qp->wq_sig) scat++; for (i = 0; i < wr->num_sge; i++) set_data_ptr_seg(scat + i, wr->sg_list + i); if (i < qp->rq.max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); scat[i].addr = 0; } if (qp->wq_sig) { sig = (struct mlx5_rwqe_sig *)scat; set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); } qp->rq.wrid[ind] = wr->wr_id; ind = (ind + 1) & (qp->rq.wqe_cnt - 1); } out: if (likely(nreq)) { qp->rq.head += nreq; /* Make sure that descriptors are written before * doorbell record. 
*/ wmb(); *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); } spin_unlock_irqrestore(&qp->rq.lock, flags); return err; } static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) { switch (mlx5_state) { case MLX5_QP_STATE_RST: return IB_QPS_RESET; case MLX5_QP_STATE_INIT: return IB_QPS_INIT; case MLX5_QP_STATE_RTR: return IB_QPS_RTR; case MLX5_QP_STATE_RTS: return IB_QPS_RTS; case MLX5_QP_STATE_SQ_DRAINING: case MLX5_QP_STATE_SQD: return IB_QPS_SQD; case MLX5_QP_STATE_SQER: return IB_QPS_SQE; case MLX5_QP_STATE_ERR: return IB_QPS_ERR; default: return -1; } } static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) { switch (mlx5_mig_state) { case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; case MLX5_QP_PM_REARM: return IB_MIG_REARM; case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; default: return -1; } } static int to_ib_qp_access_flags(int mlx5_flags) { int ib_flags = 0; if (mlx5_flags & MLX5_QP_BIT_RRE) ib_flags |= IB_ACCESS_REMOTE_READ; if (mlx5_flags & MLX5_QP_BIT_RWE) ib_flags |= IB_ACCESS_REMOTE_WRITE; if (mlx5_flags & MLX5_QP_BIT_RAE) ib_flags |= IB_ACCESS_REMOTE_ATOMIC; return ib_flags; } static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, struct mlx5_qp_path *path) { struct mlx5_core_dev *dev = ibdev->mdev; memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); ib_ah_attr->port_num = path->port; if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports)) return; ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? 
IB_AH_GRH : 0; if (ib_ah_attr->ah_flags) { ib_ah_attr->grh.sgid_index = path->mgid_index; ib_ah_attr->grh.hop_limit = path->hop_limit; ib_ah_attr->grh.traffic_class = (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; ib_ah_attr->grh.flow_label = be32_to_cpu(path->tclass_flowlabel) & 0xfffff; memcpy(ib_ah_attr->grh.dgid.raw, path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); } } static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq, u8 *sq_state) { void *out; void *sqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(query_sq_out); out = mlx5_vzalloc(inlen); if (!out) return -ENOMEM; err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out); if (err) goto out; sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); *sq_state = MLX5_GET(sqc, sqc, state); sq->state = *sq_state; out: kvfree(out); return err; } static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, u8 *rq_state) { void *out; void *rqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(query_rq_out); out = mlx5_vzalloc(inlen); if (!out) return -ENOMEM; err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); if (err) goto out; rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); *rq_state = MLX5_GET(rqc, rqc, state); rq->state = *rq_state; out: kvfree(out); return err; } static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, struct mlx5_ib_qp *qp, u8 *qp_state) { static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { [MLX5_RQC_STATE_RST] = { [MLX5_SQC_STATE_RST] = IB_QPS_RESET, [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, [MLX5_SQ_STATE_NA] = IB_QPS_RESET, }, [MLX5_RQC_STATE_RDY] = { [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, }, [MLX5_RQC_STATE_ERR] = { [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, [MLX5_SQ_STATE_NA] 
= IB_QPS_ERR, }, [MLX5_RQ_STATE_NA] = { [MLX5_SQC_STATE_RST] = IB_QPS_RESET, [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE, [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, }, }; *qp_state = sqrq_trans[rq_state][sq_state]; if (*qp_state == MLX5_QP_STATE_BAD) { WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); return -EINVAL; } if (*qp_state == MLX5_QP_STATE) *qp_state = qp->state; return 0; } static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u8 *raw_packet_qp_state) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_sq *sq = &raw_packet_qp->sq; struct mlx5_ib_rq *rq = &raw_packet_qp->rq; int err; u8 sq_state = MLX5_SQ_STATE_NA; u8 rq_state = MLX5_RQ_STATE_NA; if (qp->sq.wqe_cnt) { err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); if (err) return err; } if (qp->rq.wqe_cnt) { err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); if (err) return err; } return sqrq_state_to_qp_state(sq_state, rq_state, qp, raw_packet_qp_state); } static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_attr *qp_attr) { int outlen = MLX5_ST_SZ_BYTES(query_qp_out); struct mlx5_qp_context *context; int mlx5_state; u32 *outb; int err = 0; outb = kzalloc(outlen, GFP_KERNEL); if (!outb) return -ENOMEM; err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, outlen); if (err) goto out; /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc); mlx5_state = be32_to_cpu(context->flags) >> 28; qp->state = to_ib_qp_state(mlx5_state); qp_attr->path_mtu = context->mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); qp_attr->qkey = be32_to_cpu(context->qkey); qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 
0xffffff; qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context->params2)); if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); qp_attr->alt_pkey_index = be16_to_cpu(context->alt_path.pkey_index); qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); qp_attr->port_num = context->pri_path.port; /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); qp_attr->max_dest_rd_atomic = 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); qp_attr->min_rnr_timer = (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; qp_attr->timeout = context->pri_path.ackto_lt >> 3; qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; out: kfree(outb); return err; } int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); int err = 0; u8 raw_packet_qp_state; if (ibqp->rwq_ind_tbl) return -ENOSYS; if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* * Wait for any outstanding page faults, in case the user frees memory * based upon this query's result. 
*/ flush_workqueue(mlx5_ib_page_fault_wq); #endif mutex_lock(&qp->mutex); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); if (err) goto out; qp->state = raw_packet_qp_state; qp_attr->port_num = 1; } else { err = query_qp_attr(dev, qp, qp_attr); if (err) goto out; } qp_attr->qp_state = qp->state; qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; qp_attr->cap.max_recv_sge = qp->rq.max_gs; if (!ibqp->uobject) { qp_attr->cap.max_send_wr = qp->sq.max_post; qp_attr->cap.max_send_sge = qp->sq.max_gs; qp_init_attr->qp_context = ibqp->qp_context; } else { qp_attr->cap.max_send_wr = 0; qp_attr->cap.max_send_sge = 0; } qp_init_attr->qp_type = ibqp->qp_type; qp_init_attr->recv_cq = ibqp->recv_cq; qp_init_attr->send_cq = ibqp->send_cq; qp_init_attr->srq = ibqp->srq; qp_attr->cap.max_inline_data = qp->max_inline_data; qp_init_attr->cap = qp_attr->cap; qp_init_attr->create_flags = 0; if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; if (qp->flags & MLX5_IB_QP_MANAGED_SEND) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; if (qp->flags & MLX5_IB_QP_MANAGED_RECV) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; if (qp->flags & MLX5_IB_QP_SQPN_QP1) qp_init_attr->create_flags |= MLX5_IB_QP_CREATE_SQPN_QP1; qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; out: mutex_unlock(&qp->mutex); return err; } struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_xrcd *xrcd; int err; if (!MLX5_CAP_GEN(dev->mdev, xrc)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); if (err) { kfree(xrcd); return ERR_PTR(-ENOMEM); } return &xrcd->ibxrcd; } int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { struct mlx5_ib_dev *dev = to_mdev(xrcd->device); u32 xrcdn = to_mxrcd(xrcd)->xrcdn; int err; err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); if (err) { mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); return err; } kfree(xrcd); return 0; } static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) { struct mlx5_ib_rwq *rwq = to_mibrwq(core_qp); struct mlx5_ib_dev *dev = to_mdev(rwq->ibwq.device); struct ib_event event; if (rwq->ibwq.event_handler) { event.device = rwq->ibwq.device; event.element.wq = &rwq->ibwq; switch (type) { case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: event.event = IB_EVENT_WQ_FATAL; break; default: mlx5_ib_warn(dev, "Unexpected event type %d on WQ %06x\n", type, core_qp->qpn); return; } rwq->ibwq.event_handler(&event, rwq->ibwq.wq_context); } } static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, struct ib_wq_init_attr *init_attr) { struct mlx5_ib_dev *dev; __be64 *rq_pas0; void *in; void *rqc; void *wq; int inlen; int err; dev = to_mdev(pd->device); inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE); MLX5_SET(rqc, rqc, user_index, rwq->user_index); MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); 
MLX5_SET(rqc, rqc, flush_in_error_en, 1); wq = MLX5_ADDR_OF(rqc, rqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); kvfree(in); return err; } static int set_user_rq_size(struct mlx5_ib_dev *dev, struct ib_wq_init_attr *wq_init_attr, struct mlx5_ib_create_wq *ucmd, struct mlx5_ib_rwq *rwq) { /* Sanity check RQ size before proceeding */ if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz))) return -EINVAL; if (!ucmd->rq_wqe_count) return -EINVAL; rwq->wqe_count = ucmd->rq_wqe_count; rwq->wqe_shift = ucmd->rq_wqe_shift; rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift); rwq->log_rq_stride = rwq->wqe_shift; rwq->log_rq_size = ilog2(rwq->wqe_count); return 0; } static int prepare_user_rq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_rwq *rwq) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_wq ucmd = {}; int err; size_t required_cmd_sz; required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); if (udata->inlen < required_cmd_sz) { mlx5_ib_dbg(dev, "invalid inlen\n"); return -EINVAL; } if (udata->inlen > sizeof(ucmd) && !ib_is_udata_cleared(udata, sizeof(ucmd), udata->inlen - sizeof(ucmd))) { mlx5_ib_dbg(dev, "inlen is not supported\n"); return -EOPNOTSUPP; } if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { mlx5_ib_dbg(dev, "copy failed\n"); return -EFAULT; } if 
(ucmd.comp_mask) { mlx5_ib_dbg(dev, "invalid comp mask\n"); return -EOPNOTSUPP; } if (ucmd.reserved) { mlx5_ib_dbg(dev, "invalid reserved\n"); return -EOPNOTSUPP; } err = set_user_rq_size(dev, init_attr, &ucmd, rwq); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); return err; } err = create_user_rq(dev, pd, rwq, &ucmd); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); if (err) return err; } rwq->user_index = ucmd.user_index; return 0; } struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata) { struct mlx5_ib_dev *dev; struct mlx5_ib_rwq *rwq; struct mlx5_ib_create_wq_resp resp = {}; size_t min_resp_len; int err; if (!udata) return ERR_PTR(-ENOSYS); min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); if (udata->outlen && udata->outlen < min_resp_len) return ERR_PTR(-EINVAL); dev = to_mdev(pd->device); switch (init_attr->wq_type) { case IB_WQT_RQ: rwq = kzalloc(sizeof(*rwq), GFP_KERNEL); if (!rwq) return ERR_PTR(-ENOMEM); err = prepare_user_rq(pd, init_attr, udata, rwq); if (err) goto err; err = create_rq(rwq, pd, init_attr); if (err) goto err_user_rq; break; default: mlx5_ib_dbg(dev, "unsupported wq type %d\n", init_attr->wq_type); return ERR_PTR(-EINVAL); } rwq->ibwq.wq_num = rwq->core_qp.qpn; rwq->ibwq.state = IB_WQS_RESET; if (udata->outlen) { resp.response_length = offsetof(typeof(resp), response_length) + sizeof(resp.response_length); err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto err_copy; } rwq->core_qp.event = mlx5_ib_wq_event; rwq->ibwq.event_handler = init_attr->event_handler; return &rwq->ibwq; err_copy: mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); err_user_rq: destroy_user_rq(pd, rwq); err: kfree(rwq); return ERR_PTR(err); } int mlx5_ib_destroy_wq(struct ib_wq *wq) { struct mlx5_ib_dev *dev = to_mdev(wq->device); struct mlx5_ib_rwq *rwq = to_mrwq(wq); mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); destroy_user_rq(wq->pd, rwq); kfree(rwq); 
return 0; } struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_ib_rwq_ind_table *rwq_ind_tbl; int sz = 1 << init_attr->log_ind_tbl_size; struct mlx5_ib_create_rwq_ind_tbl_resp resp = {}; size_t min_resp_len; int inlen; int err; int i; u32 *in; void *rqtc; if (udata->inlen > 0 && !ib_is_udata_cleared(udata, 0, udata->inlen)) return ERR_PTR(-EOPNOTSUPP); if (init_attr->log_ind_tbl_size > MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) { mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n", init_attr->log_ind_tbl_size, MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)); return ERR_PTR(-EINVAL); } min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); if (udata->outlen && udata->outlen < min_resp_len) return ERR_PTR(-EINVAL); rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL); if (!rwq_ind_tbl) return ERR_PTR(-ENOMEM); inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto err; } rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); MLX5_SET(rqtc, rqtc, rqt_max_size, sz); for (i = 0; i < sz; i++) MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); kvfree(in); if (err) goto err; rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn; if (udata->outlen) { resp.response_length = offsetof(typeof(resp), response_length) + sizeof(resp.response_length); err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto err_copy; } return &rwq_ind_tbl->ib_rwq_ind_tbl; err_copy: mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); err: kfree(rwq_ind_tbl); return ERR_PTR(err); } int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) { struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = 
to_mrwq_ind_table(ib_rwq_ind_tbl); struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device); mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); kfree(rwq_ind_tbl); return 0; } int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(wq->device); struct mlx5_ib_rwq *rwq = to_mrwq(wq); struct mlx5_ib_modify_wq ucmd = {}; size_t required_cmd_sz; int curr_wq_state; int wq_state; int inlen; int err; void *rqc; void *in; required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); if (udata->inlen < required_cmd_sz) return -EINVAL; if (udata->inlen > sizeof(ucmd) && !ib_is_udata_cleared(udata, sizeof(ucmd), udata->inlen - sizeof(ucmd))) return -EOPNOTSUPP; if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) return -EFAULT; if (ucmd.comp_mask || ucmd.reserved) return -EOPNOTSUPP; inlen = MLX5_ST_SZ_BYTES(modify_rq_in); in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(modify_rq_in, in, rqn, rwq->core_qp.qpn); curr_wq_state = (wq_attr_mask & IB_WQ_CUR_STATE) ? wq_attr->curr_wq_state : wq->state; wq_state = (wq_attr_mask & IB_WQ_STATE) ? wq_attr->wq_state : curr_wq_state; if (curr_wq_state == IB_WQS_ERR) curr_wq_state = MLX5_RQC_STATE_ERR; if (wq_state == IB_WQS_ERR) wq_state = MLX5_RQC_STATE_ERR; MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); MLX5_SET(rqc, rqc, state, wq_state); err = mlx5_core_modify_rq(dev->mdev, in, inlen); kvfree(in); if (!err) rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; return err; } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c index 42e10f9a50de..f0d09a2aca8d 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c @@ -1,497 +1,497 @@ /*- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 * $FreeBSD$
 */

/*
 * NOTE: the header names of the next six #include directives are missing
 * from this copy of the file.
 */
#include
#include
#include
#include
#include
#include

#include "mlx5_ib.h"

/* not supported currently */
static int srq_signature;

/*
 * Return the address of WQE number n inside the SRQ buffer; the byte offset
 * is n shifted left by the SRQ's wqe_shift.
 */
static void *get_wqe(struct mlx5_ib_srq *srq, int n)
{
	return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
}

/*
 * Event callback installed in msrq.event by mlx5_ib_create_srq().
 *
 * If the IB SRQ has an event handler, translate the core event type into
 * the matching IB event (RQ limit -> IB_EVENT_SRQ_LIMIT_REACHED,
 * catastrophic error -> IB_EVENT_SRQ_ERR) and invoke the handler.  Any other
 * type is logged with pr_warn() and dropped.
 */
static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, int type)
{
	struct ib_event event;
	struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;

	if (ibsrq->event_handler) {
		event.device      = ibsrq->device;
		event.element.srq = ibsrq;
		switch (type) {
		case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
			event.event = IB_EVENT_SRQ_LIMIT_REACHED;
			break;
		case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
			event.event = IB_EVENT_SRQ_ERR;
			break;
		default:
			pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n",
				type, srq->srqn);
			return;
		}

		ibsrq->event_handler(&event, ibsrq->srq_context);
	}
}

/*
 * Set up the user-space backed parts of an SRQ.
 *
 * Copies the create command from @udata (reserved fields and any bytes past
 * the known structure must be zero), pins the user buffer at ucmd.buf_addr
 * as srq->umem, allocates and fills in->pas from it, and maps the user
 * doorbell at ucmd.db_addr into srq->db.  For XRC SRQs the user index is
 * taken from get_srq_user_index() and stored in in->user_index when the
 * device reports CQE version 1.
 *
 * Returns 0 on success.  On failure the umem and in->pas acquired here are
 * released again and a negative errno is returned; in->pas is left pointing
 * at the freed array in that case.
 */
static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
			   struct mlx5_srq_attr *in,
			   struct ib_udata *udata, int buf_size)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_create_srq ucmd = {};
	size_t ucmdlen;
	int err;
	int npages;
	int page_shift;
	int ncont;
	u32 offset;
	u32 uidx = MLX5_IB_DEFAULT_UIDX;

	/* Copy no more than the structure known here; ucmd starts zeroed. */
	ucmdlen = min(udata->inlen, sizeof(ucmd));

	if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) {
		mlx5_ib_dbg(dev, "failed copy udata\n");
		return -EFAULT;
	}

	if (ucmd.reserved0 || ucmd.reserved1)
		return -EINVAL;

	if (udata->inlen > sizeof(ucmd) &&
	    !ib_is_udata_cleared(udata, sizeof(ucmd),
				 udata->inlen - sizeof(ucmd)))
		return -EINVAL;

	if (in->type == IB_SRQT_XRC) {
		err = get_srq_user_index(to_mucontext(pd->uobject->context),
					 &ucmd, udata->inlen, &uidx);
		if (err)
			return err;
	}

	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);

	srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
				0, 0);
	if (IS_ERR(srq->umem)) {
		mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
		err = PTR_ERR(srq->umem);
		return err;
	}

	mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, 0, &npages,
			   &page_shift, &ncont, NULL);
	err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift,
				     &offset);
	if (err) {
		mlx5_ib_warn(dev, "bad offset\n");
		goto err_umem;
	}

	in->pas = mlx5_vzalloc(sizeof(*in->pas) * ncont);
	if (!in->pas) {
		err = -ENOMEM;
		goto err_umem;
	}

	mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);

	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
				  ucmd.db_addr, &srq->db);
	if (err) {
		mlx5_ib_dbg(dev, "map doorbell failed\n");
		goto err_in;
	}

	in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
	in->page_offset = offset;
	if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
	    in->type == IB_SRQT_XRC)
		in->user_index = uidx;

	return 0;

err_in:
	kvfree(in->pas);

err_umem:
	ib_umem_release(srq->umem);

	return err;
}

/*
 * Set up the kernel-owned parts of an SRQ.
 *
 * Allocates the doorbell record and the WQE buffer, resets head/tail/wqe_ctr,
 * links every WQE to its successor through next_wqe_index (the last one
 * wraps to index 0 via the "& (max - 1)" mask), fills in->pas from the
 * buffer and allocates the wrid[] array with one u64 per WQE.
 *
 * Returns 0 on success or a negative errno, with everything allocated here
 * released again on failure.
 */
static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
			     struct mlx5_srq_attr *in, int buf_size)
{
	int err;
	int i;
	struct mlx5_wqe_srq_next_seg *next;

	err = mlx5_db_alloc(dev->mdev, &srq->db);
	if (err) {
		mlx5_ib_warn(dev, "alloc dbell rec failed\n");
		return err;
	}

	if (mlx5_buf_alloc(dev->mdev, buf_size, 2 * PAGE_SIZE, &srq->buf)) {
		mlx5_ib_dbg(dev, "buf alloc failed\n");
		err = -ENOMEM;
		goto err_db;
	}

	srq->head    = 0;
	srq->tail    = srq->msrq.max - 1;
	srq->wqe_ctr = 0;

	for (i = 0; i < srq->msrq.max; i++) {
		next = get_wqe(srq, i);
		next->next_wqe_index =
			cpu_to_be16((i + 1) & (srq->msrq.max - 1));
	}

	mlx5_ib_dbg(dev, "srq->buf.page_shift = %d\n", srq->buf.page_shift);
	in->pas = mlx5_vzalloc(sizeof(*in->pas) * srq->buf.npages);
	if (!in->pas) {
		err = -ENOMEM;
		goto err_buf;
	}
	mlx5_fill_page_array(&srq->buf, in->pas);

	srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL);
	if (!srq->wrid) {
		mlx5_ib_dbg(dev, "kmalloc failed %lu\n",
			    (unsigned long)(srq->msrq.max * sizeof(u64)));
		err = -ENOMEM;
		goto err_in;
	}
	srq->wq_sig = !!srq_signature;

	in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
	if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
	    in->type == IB_SRQT_XRC)
		in->user_index = MLX5_IB_DEFAULT_UIDX;

	return 0;

err_in:
	kvfree(in->pas);

err_buf:
	mlx5_buf_free(dev->mdev, &srq->buf);

err_db:
	mlx5_db_free(dev->mdev, &srq->db);
	return err;
}

/* Undo create_srq_user(): unmap the user doorbell and release the umem. */
static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq)
{
	mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
	ib_umem_release(srq->umem);
}


/* Undo create_srq_kernel(): free wrid[], the WQE buffer and the doorbell. */
static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq)
{
	kfree(srq->wrid);
	mlx5_buf_free(dev->mdev, &srq->buf);
	mlx5_db_free(dev->mdev, &srq->db);
}

/*
 * mlx5_ib_create_srq - create a shared receive queue.
 * @pd:        protection domain; a non-NULL pd->uobject selects the
 *             user-space path, otherwise the kernel path is used.
 * @init_attr: requested max_wr / max_sge and SRQ type; attr.max_wr is
 *             updated on success to the usable depth (msrq.max - 1).
 * @udata:     user command/response buffers (user path only).
 *
 * The queue depth is max_wr + 1 rounded up to a power of two.  The WQE size
 * is the next-segment header plus max_sge data segments, rounded up to a
 * power of two and never below 32 bytes.  The size computations are checked
 * for wrap-around before any buffer is set up.
 *
 * Returns the embedded ib_srq on success or an ERR_PTR() on failure.
 */
struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
				  struct ib_srq_init_attr *init_attr,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_srq *srq;
	size_t desc_size;
	size_t buf_size;
	int err;
	struct mlx5_srq_attr in = {0};
	__u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);

	/* Sanity check SRQ size before proceeding */
	if (init_attr->attr.max_wr >= max_srq_wqes) {
		mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
			    init_attr->attr.max_wr,
			    max_srq_wqes);
		return ERR_PTR(-EINVAL);
	}

	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
	if (!srq)
		return ERR_PTR(-ENOMEM);

	mutex_init(&srq->mutex);
	spin_lock_init(&srq->lock);
	srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
	srq->msrq.max_gs = init_attr->attr.max_sge;

	desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
		    srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
	/* Reject a descriptor size that wrapped around. */
	if (desc_size == 0 || srq->msrq.max_gs > desc_size) {
		err = -EINVAL;
		goto err_srq;
	}
	desc_size = roundup_pow_of_two(desc_size);
	desc_size = max_t(size_t, 32, desc_size);
	if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) {
		err = -EINVAL;
		goto err_srq;
	}
	/* Number of data segments that fit after rounding the WQE size up. */
	srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
		sizeof(struct mlx5_wqe_data_seg);
	srq->msrq.wqe_shift = ilog2(desc_size);
	buf_size = srq->msrq.max * desc_size;
	/* Reject a buffer size that wrapped around. */
	if (buf_size < desc_size) {
		err = -EINVAL;
		goto err_srq;
	}
	in.type = init_attr->srq_type;

	if (pd->uobject)
		err = create_srq_user(pd, srq, &in, udata, buf_size);
	else
		err = create_srq_kernel(dev, srq, &in, buf_size);

	/*
	 * NOTE(review): both helpers above return an error whenever they fail
	 * to allocate in.pas, so "!in.pas" looks unreachable with err == 0.
	 * If it ever were reached, err would still be 0 at ERR_PTR(err) below
	 * and the helper's resources would not be released -- confirm.
	 */
	if (err || !in.pas) {
		mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
			     pd->uobject ? "user" : "kernel", err);
		goto err_srq;
	}

	in.log_size = ilog2(srq->msrq.max);
	in.wqe_shift = srq->msrq.wqe_shift - 4;
	if (srq->wq_sig)
		in.flags |= MLX5_SRQ_FLAG_WQ_SIG;
	if (init_attr->srq_type == IB_SRQT_XRC) {
		in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
		in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn;
	} else if (init_attr->srq_type == IB_SRQT_BASIC) {
		/* Basic SRQs use the device's own x0 XRCD and c0 CQ. */
		in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn;
		in.cqn = to_mcq(dev->devr.c0)->mcq.cqn;
	}

	in.pd = to_mpd(pd)->pdn;
	in.db_record = srq->db.dma;
	err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in);
	/* The page-address array is freed here whether or not the call failed. */
	kvfree(in.pas);
	if (err) {
		mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
		goto err_usr_kern_srq;
	}

	mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn);

	srq->msrq.event = mlx5_ib_srq_event;
	srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;

	/* User-space callers get the SRQ number back as the response. */
	if (pd->uobject)
		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) {
			mlx5_ib_dbg(dev, "copy to user failed\n");
			err = -EFAULT;
			goto err_core;
		}

	init_attr->attr.max_wr = srq->msrq.max - 1;

	return &srq->ibsrq;

err_core:
	mlx5_core_destroy_srq(dev->mdev, &srq->msrq);

err_usr_kern_srq:
	if (pd->uobject)
		destroy_srq_user(pd, srq);
	else
		destroy_srq_kernel(dev, srq);

err_srq:
	kfree(srq);

	return ERR_PTR(err);
}

/*
 * Modify an SRQ.  Resizing (IB_SRQ_MAX_WR) is rejected with -EINVAL.  With
 * IB_SRQ_LIMIT the SRQ is armed at attr->srq_limit under srq->mutex; a limit
 * at or above the queue depth is rejected with -EINVAL.  Returns 0 or the
 * error from mlx5_core_arm_srq().  @udata is not used.
 */
int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
	int ret;

	/* We don't support resizing SRQs yet */
	if (attr_mask & IB_SRQ_MAX_WR)
		return -EINVAL;

	if (attr_mask & IB_SRQ_LIMIT) {
		if (attr->srq_limit >= srq->msrq.max)
			return -EINVAL;

		mutex_lock(&srq->mutex);
		ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1);
		mutex_unlock(&srq->mutex);

		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Query an SRQ.  srq_limit is taken from the lwm field returned by
 * mlx5_core_query_srq(); max_wr and max_sge are reported from the values
 * cached in the SRQ.  Returns 0, -ENOMEM, or the query error.
 */
int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
{
	struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
	int ret;
	struct mlx5_srq_attr *out;

	out = kzalloc(sizeof(*out), GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out);
	if (ret)
		goto out_box;

	srq_attr->srq_limit = out->lwm;
	srq_attr->max_wr    = srq->msrq.max - 1;
	srq_attr->max_sge   = srq->msrq.max_gs;

out_box:
	kfree(out);
	return ret;
}

/*
 * Destroy an SRQ: destroy the core object, release the user-space or kernel
 * resources depending on whether srq->uobject is set, then free the object.
 * Always returns 0.
 */
int mlx5_ib_destroy_srq(struct ib_srq *srq)
{
	struct mlx5_ib_dev *dev = to_mdev(srq->device);
	struct mlx5_ib_srq *msrq = to_msrq(srq);

	mlx5_core_destroy_srq(dev->mdev, &msrq->msrq);

	if (srq->uobject) {
		mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
		ib_umem_release(msrq->umem);
	} else {
		destroy_srq_kernel(dev, msrq);
	}

	/*
	 * NOTE(review): this frees the ib_srq pointer rather than msrq, while
	 * the allocation in mlx5_ib_create_srq() is the mlx5_ib_srq.  That is
	 * only equivalent if ibsrq is the first member -- confirm.
	 */
	kfree(srq);
	return 0;
}

/*
 * Return WQE wqe_index to the free list: the current tail WQE is linked to
 * it and it becomes the new tail.  Serialised by srq->lock.
 */
void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index)
{
	struct mlx5_wqe_srq_next_seg *next;

	/* always called with interrupts disabled. */
	spin_lock(&srq->lock);

	next = get_wqe(srq, srq->tail);
	next->next_wqe_index = cpu_to_be16(wqe_index);
	srq->tail = wqe_index;

	spin_unlock(&srq->lock);
}

/*
 * Post a chain of receive work requests to the SRQ.
 *
 * Under srq->lock, each request takes the WQE at srq->head, records its
 * wr_id in wrid[] and writes its scatter list in big-endian form.  If fewer
 * entries are used than max_avail_gather, the next entry is written with a
 * zero length and MLX5_INVALID_LKEY.  After the loop the doorbell record is
 * updated with the new wqe_ctr if at least one request was queued.
 *
 * Returns 0 on success.  On failure *bad_wr is set to the offending request
 * and the result is -EIO (device in internal error state), -EINVAL (too many
 * SGEs) or -ENOMEM (head has caught up with tail).  Requests queued before
 * the failing one stay posted.
 */
-int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
-			  struct ib_recv_wr **bad_wr)
+int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+			  const struct ib_recv_wr **bad_wr)
{
	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
	struct mlx5_wqe_srq_next_seg *next;
	struct mlx5_wqe_data_seg *scat;
	struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	unsigned long flags;
	int err = 0;
	int nreq;
	int i;

	spin_lock_irqsave(&srq->lock, flags);

	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
		err = -EIO;
		*bad_wr = wr;
		goto out;
	}

	for (nreq = 0; wr; nreq++, wr = wr->next) {
		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
			err = -EINVAL;
			*bad_wr = wr;
			break;
		}

		if (unlikely(srq->head == srq->tail)) {
			err = -ENOMEM;
			*bad_wr = wr;
			break;
		}

		srq->wrid[srq->head] = wr->wr_id;

		next      = get_wqe(srq, srq->head);
		srq->head = be16_to_cpu(next->next_wqe_index);
		/* The data segments start right after the next-segment header. */
		scat      = (struct mlx5_wqe_data_seg *)(next + 1);

		for (i = 0; i < wr->num_sge; i++) {
			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
		}

		if (i < srq->msrq.max_avail_gather) {
			scat[i].byte_count = 0;
			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
			scat[i].addr       = 0;
		}
	}

	if (likely(nreq)) {
		srq->wqe_ctr += nreq;

		/* Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*srq->db.db = cpu_to_be32(srq->wqe_ctr);
	}
out:
	spin_unlock_irqrestore(&srq->lock, flags);

	return err;
}
diff --git a/sys/dev/mthca/mthca_dev.h b/sys/dev/mthca/mthca_dev.h
index 854d37e4541a..f9dcef8e7b88 100644
--- a/sys/dev/mthca/mthca_dev.h
+++ b/sys/dev/mthca/mthca_dev.h
@@ -1,599 +1,599 @@
/*
 * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above
 * copyright notice, this list of conditions and the following
 * disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials
 * provided with the distribution.
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef MTHCA_DEV_H
#define MTHCA_DEV_H

/*
 * NOTE: the header names of the next eight #include directives are missing
 * from this copy of the file.
 */
#include
#include
#include
#include
#include
#include
#include
#include

#include "mthca_provider.h"
#include "mthca_doorbell.h"

/* Driver identification strings; PFX is DRV_NAME followed by ": ". */
#define DRV_NAME	"ib_mthca"
#define PFX		DRV_NAME ": "
/* DRV_VERSION may be supplied from outside; "1.0" is the fallback. */
#ifndef DRV_VERSION
#define DRV_VERSION	"1.0"
#endif
#define DRV_RELDATE	"April 4, 2008"

/*
 * Single-bit flag values (bits 1 through 8).  MTHCA_FLAG_MEMFREE is tested
 * against mthca_dev.mthca_flags by mthca_is_memfree(); the other flags are
 * presumably kept in the same field -- confirm against their users.
 */
enum {
	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
	MTHCA_FLAG_SRQ        = 1 << 2,
	MTHCA_FLAG_MSI_X      = 1 << 3,
	MTHCA_FLAG_NO_LAM     = 1 << 4,
	MTHCA_FLAG_FMR        = 1 << 5,
	MTHCA_FLAG_MEMFREE    = 1 << 6,
	MTHCA_FLAG_PCIE       = 1 << 7,
	MTHCA_FLAG_SINAI_OPT  = 1 << 8
};

/* Dimension of the per-port arrays in struct mthca_dev. */
enum {
	MTHCA_MAX_PORTS = 2
};

/* Size of mthca_dev.board_id[]. */
enum {
	MTHCA_BOARD_ID_LEN = 64
};

/*
 * Context and entry sizes.  MTHCA_QP_PER_MGM is derived from
 * MTHCA_MGM_ENTRY_SIZE: 4 * (0x100 / 16 - 2) = 56.
 */
enum {
	MTHCA_EQ_CONTEXT_SIZE =  0x40,
	MTHCA_CQ_CONTEXT_SIZE =  0x40,
	MTHCA_QP_CONTEXT_SIZE = 0x200,
	MTHCA_RDB_ENTRY_SIZE  =  0x20,
	MTHCA_AV_SIZE         =  0x20,
	MTHCA_MGM_ENTRY_SIZE  = 0x100,

	/* Arbel FW gives us these, but we need them for Tavor */
	MTHCA_MPT_ENTRY_SIZE  =  0x40,
	MTHCA_MTT_SEG_SIZE    =  0x40,

	MTHCA_QP_PER_MGM      = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
};

/*
 * Event queue kinds; MTHCA_NUM_EQ (3) is the size of mthca_eq_table.eq[],
 * so the other values are presumably indices into that array.
 */
enum {
	MTHCA_EQ_CMD,
	MTHCA_EQ_ASYNC,
	MTHCA_EQ_COMP,
	MTHCA_NUM_EQ
};

/* Opcode values; mthca_qp.c maps IB_WR_* opcodes to these in mthca_opcode[]. */
enum {
	MTHCA_OPCODE_NOP            = 0x00,
	MTHCA_OPCODE_RDMA_WRITE     = 0x08,
	MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
	MTHCA_OPCODE_SEND           = 0x0a,
	MTHCA_OPCODE_SEND_IMM       = 0x0b,
	MTHCA_OPCODE_RDMA_READ      = 0x10,
	MTHCA_OPCODE_ATOMIC_CS      = 0x11,
	MTHCA_OPCODE_ATOMIC_FA      = 0x12,
	MTHCA_OPCODE_BIND_MW        = 0x18,
	MTHCA_OPCODE_INVALID        = 0xff
};

/* Single-bit flags, presumably for mthca_cmd.flags -- confirm. */
enum {
	MTHCA_CMD_USE_EVENTS         = 1 << 0,
	MTHCA_CMD_POST_DOORBELLS     = 1 << 1
};

/* Size of mthca_cmd.dbell_offsets[]. */
enum {
	MTHCA_CMD_NUM_DBELL_DWORDS = 8
};

/* Command interface state, embedded in struct mthca_dev as 'cmd'. */
struct mthca_cmd {
	struct pci_pool          *pool;
	struct mutex              hcr_mutex;
	struct semaphore 	  poll_sem;
	struct semaphore 	  event_sem;
	int              	  max_cmds;
	spinlock_t                context_lock;
	int                       free_head;
	struct mthca_cmd_context *context;
	u16                       token_mask;
	u32                       flags;
	void __iomem             *dbell_map;
	u16                       dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS];
};

/*
 * Resource counts and capabilities, embedded in struct mthca_dev as
 * 'limits'.  num_qps is used as a power-of-two mask source by
 * mthca_qp_event() in mthca_qp.c ("qpn & (num_qps - 1)").
 */
struct mthca_limits {
	int      num_ports;
	int      vl_cap;
	int      mtu_cap;
	int      gid_table_len;
	int      pkey_table_len;
	int      local_ca_ack_delay;
	int      num_uars;
	int      max_sg;
	int      num_qps;
	int      max_wqes;
	int	 max_desc_sz;
	int	 max_qp_init_rdma;
	int      reserved_qps;
	int      num_srqs;
	int      max_srq_wqes;
	int      max_srq_sge;
	int      reserved_srqs;
	int      num_eecs;
	int      reserved_eecs;
	int      num_cqs;
	int      max_cqes;
	int      reserved_cqs;
	int      num_eqs;
	int      reserved_eqs;
	int      num_mpts;
	int      num_mtt_segs;
	int	 mtt_seg_size;
	int      fmr_reserved_mtts;
	int      reserved_mtts;
	int      reserved_mrws;
	int      reserved_uars;
	int      num_mgms;
	int      num_amgms;
	int      reserved_mcgs;
	int      num_pds;
	int      reserved_pds;
	u32      page_size_cap;
	u32      flags;
	u16      stat_rate_support;
	u8       port_width_cap;
};

/* State of the allocator used through mthca_alloc() / mthca_free(). */
struct mthca_alloc {
	u32            last;
	u32            top;
	u32            max;
	u32            mask;
	spinlock_t     lock;
	unsigned long *table;
};

/* Paged pointer array accessed through mthca_array_get() / _set() / _clear(). */
struct mthca_array {
	struct {
		void    **page;
		int       used;
	} *page_list;
};

struct mthca_uar_table {
	struct mthca_alloc alloc;
	u64                uarc_base;
	int                uarc_size;
};

struct mthca_pd_table {
	struct mthca_alloc alloc;
};

struct mthca_buddy {
	unsigned long **bits;
	int	       *num_free;
	int             max_order;
	spinlock_t      lock;
};

struct mthca_mr_table {
	struct mthca_alloc      mpt_alloc;
	struct mthca_buddy      mtt_buddy;
	struct mthca_buddy     *fmr_mtt_buddy;
	u64                     mtt_base;
	u64                     mpt_base;
	struct mthca_icm_table *mtt_table;
	struct mthca_icm_table *mpt_table;
	struct {
		void __iomem   *mpt_base;
		void __iomem   *mtt_base;
		struct mthca_buddy mtt_buddy;
	} tavor_fmr;
};

struct mthca_eq_table {
	struct mthca_alloc alloc;
	void __iomem      *clr_int;
	u32                clr_mask;
	u32                arm_mask;
	struct mthca_eq    eq[MTHCA_NUM_EQ];
	u64                icm_virt;
	struct page       *icm_page;
	dma_addr_t         icm_dma;
	int                have_irq;
	u8                 inta_pin;
};

struct mthca_cq_table {
	struct mthca_alloc 	alloc;
	spinlock_t         	lock;
	struct mthca_array      cq;
	struct mthca_icm_table *table;
};

struct mthca_srq_table {
	struct mthca_alloc 	alloc;
	spinlock_t         	lock;
	struct mthca_array      srq;
	struct mthca_icm_table *table;
};

/*
 * QP table.  'lock' guards the 'qp' array lookups and QP refcount updates
 * in mthca_qp_event(); sqp_start is the first special QP number tested by
 * is_sqp() / is_qp0() in mthca_qp.c.
 */
struct mthca_qp_table {
	struct mthca_alloc     	alloc;
	u32                    	rdb_base;
	int                    	rdb_shift;
	int                    	sqp_start;
	spinlock_t             	lock;
	struct mthca_array     	qp;
	struct mthca_icm_table *qp_table;
	struct mthca_icm_table *eqp_table;
	struct mthca_icm_table *rdb_table;
};

struct mthca_av_table {
	struct pci_pool   *pool;
	int                num_ddr_avs;
	u64                ddr_av_base;
	void __iomem      *av_map;
	struct mthca_alloc alloc;
};

struct mthca_mcg_table {
	struct mutex		mutex;
	struct mthca_alloc 	alloc;
	struct mthca_icm_table *table;
};

struct mthca_catas_err {
	u64			addr;
	u32 __iomem	       *map;
	u32			size;
	struct timer_list	timer;
	struct list_head	list;
};

extern struct mutex mthca_device_mutex;

/*
 * Per-device state.  ib_dev is the embedded IB device that to_mdev()
 * converts back to the containing mthca_dev with container_of().
 */
struct mthca_dev {
	struct ib_device  ib_dev;
	struct pci_dev   *pdev;

	int          	 hca_type;
	unsigned long	 mthca_flags;
	unsigned long    device_cap_flags;

	u32              rev_id;
	char             board_id[MTHCA_BOARD_ID_LEN];

	/* firmware info */
	u64              fw_ver;
	union {
		struct {
			u64 fw_start;
			u64 fw_end;
		}        tavor;
		struct {
			u64 clr_int_base;
			u64 eq_arm_base;
			u64 eq_set_ci_base;
			struct mthca_icm *fw_icm;
			struct mthca_icm *aux_icm;
			u16 fw_pages;
		}        arbel;
	}                fw;

	u64              ddr_start;
	u64              ddr_end;

	/* Declared through a macro; note there is no trailing semicolon here. */
	MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
	struct mutex cap_mask_mutex;

	void __iomem    *hcr;
	void __iomem    *kar;
	void __iomem    *clr_base;
	union {
		struct {
			void __iomem *ecr_base;
		} tavor;
		struct {
			void __iomem *eq_arm;
			void __iomem *eq_set_ci_base;
		} arbel;
	} eq_regs;

	struct mthca_cmd    cmd;
	struct mthca_limits limits;

	struct mthca_uar_table uar_table;
	struct mthca_pd_table  pd_table;
	struct mthca_mr_table  mr_table;
	struct mthca_eq_table  eq_table;
	struct mthca_cq_table  cq_table;
	struct mthca_srq_table srq_table;
	struct mthca_qp_table  qp_table;
	struct mthca_av_table  av_table;
	struct mthca_mcg_table mcg_table;

	struct mthca_catas_err catas_err;

	struct mthca_uar       driver_uar;
	struct mthca_db_table *db_tab;
	struct mthca_pd        driver_pd;
	struct mthca_mr        driver_mr;

	struct ib_mad_agent  *send_agent[MTHCA_MAX_PORTS][2];
	struct ib_ah         *sm_ah[MTHCA_MAX_PORTS];
	spinlock_t            sm_lock;
	u8                    rate[MTHCA_MAX_PORTS];
	bool		      active;
};

/*
 * mthca_dbg() prints through dev_printk() at KERN_DEBUG when
 * mthca_debug_level is non-zero.  Without CONFIG_INFINIBAND_MTHCA_DEBUG it
 * only evaluates mdev and prints nothing.
 *
 * NOTE(review): mdev is not parenthesised in this macro or in the three
 * logging macros that follow, so they expect a plain identifier or an
 * already-parenthesised expression.
 */
#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
extern int mthca_debug_level;

#define mthca_dbg(mdev, format, arg...)					\
	do {								\
		if (mthca_debug_level)					\
			dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \
	} while (0)

#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */

#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0)

#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */

/* Error / info / warning logging against the device's PCI device. */
#define mthca_err(mdev, format, arg...) \
	dev_err(&mdev->pdev->dev, format, ## arg)
#define mthca_info(mdev, format, arg...) \
	dev_info(&mdev->pdev->dev, format, ## arg)
#define mthca_warn(mdev, format, arg...) \
	dev_warn(&mdev->pdev->dev, format, ## arg)

/*
 * Referenced only from the default: branches of MTHCA_GET / MTHCA_PUT;
 * presumably left undefined so that an unsupported operand size fails at
 * link time -- confirm.
 */
extern void __buggy_use_of_MTHCA_GET(void);
extern void __buggy_use_of_MTHCA_PUT(void);

/*
 * MTHCA_GET: load the field of sizeof(dest) bytes found 'offset' bytes into
 * 'source' and store it in 'dest'.  2-, 4- and 8-byte fields are converted
 * from big-endian; a 1-byte field is copied as is.
 */
#define MTHCA_GET(dest, source, offset)                               \
	do {                                                          \
		void *__p = (char *) (source) + (offset);             \
		switch (sizeof (dest)) {                              \
		case 1: (dest) = *(u8 *) __p;       break;	      \
		case 2: (dest) = be16_to_cpup(__p); break;	      \
		case 4: (dest) = be32_to_cpup(__p); break;	      \
		case 8: (dest) = be64_to_cpup(__p); break;	      \
		default: __buggy_use_of_MTHCA_GET();		      \
		}                                                     \
	} while (0)

/*
 * MTHCA_PUT: store 'source' at 'offset' bytes into 'dest'.  The width is
 * sizeof(source); 2-, 4- and 8-byte values are converted to big-endian and
 * a 1-byte value is stored as is.
 */
#define MTHCA_PUT(dest, source, offset)                               \
	do {                                                          \
		void *__d = ((char *) (dest) + (offset));	      \
		switch (sizeof(source)) {                             \
		case 1: *(u8 *) __d = (source);                break; \
		case 2:	*(__be16 *) __d = cpu_to_be16(source); break; \
		case 4:	*(__be32 *) __d = cpu_to_be32(source); break; \
		case 8:	*(__be64 *) __d = cpu_to_be64(source); break; \
		default: __buggy_use_of_MTHCA_PUT();		      \
		}                                                     \
	} while (0)

int mthca_reset(struct mthca_dev *mdev);

/* Allocator and paged-array helpers for the table structures above. */
u32 mthca_alloc(struct mthca_alloc *alloc);
void mthca_free(struct mthca_alloc *alloc, u32 obj);
int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
		     u32 reserved);
void mthca_alloc_cleanup(struct mthca_alloc *alloc);
void *mthca_array_get(struct mthca_array *array, int index);
int mthca_array_set(struct mthca_array *array, int index, void *value);
void
mthca_array_clear(struct mthca_array *array, int index); int mthca_array_init(struct mthca_array *array, int nent); void mthca_array_cleanup(struct mthca_array *array, int nent); int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, union mthca_buf *buf, int *is_direct, struct mthca_pd *pd, int hca_write, struct mthca_mr *mr); void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf, int is_direct, struct mthca_mr *mr); int mthca_init_uar_table(struct mthca_dev *dev); int mthca_init_pd_table(struct mthca_dev *dev); int mthca_init_mr_table(struct mthca_dev *dev); int mthca_init_eq_table(struct mthca_dev *dev); int mthca_init_cq_table(struct mthca_dev *dev); int mthca_init_srq_table(struct mthca_dev *dev); int mthca_init_qp_table(struct mthca_dev *dev); int mthca_init_av_table(struct mthca_dev *dev); int mthca_init_mcg_table(struct mthca_dev *dev); void mthca_cleanup_uar_table(struct mthca_dev *dev); void mthca_cleanup_pd_table(struct mthca_dev *dev); void mthca_cleanup_mr_table(struct mthca_dev *dev); void mthca_cleanup_eq_table(struct mthca_dev *dev); void mthca_cleanup_cq_table(struct mthca_dev *dev); void mthca_cleanup_srq_table(struct mthca_dev *dev); void mthca_cleanup_qp_table(struct mthca_dev *dev); void mthca_cleanup_av_table(struct mthca_dev *dev); void mthca_cleanup_mcg_table(struct mthca_dev *dev); int mthca_register_device(struct mthca_dev *dev); void mthca_unregister_device(struct mthca_dev *dev); void mthca_start_catas_poll(struct mthca_dev *dev); void mthca_stop_catas_poll(struct mthca_dev *dev); int __mthca_restart_one(struct pci_dev *pdev); int mthca_catas_init(void); void mthca_catas_cleanup(void); int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd); void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd); int mthca_write_mtt_size(struct mthca_dev 
*dev); struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size); void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt); int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, int start_index, u64 *buffer_list, int list_len); int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift, u64 iova, u64 total_size, u32 access, struct mthca_mr *mr); int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd, u32 access, struct mthca_mr *mr); int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, u64 *buffer_list, int buffer_size_shift, int list_len, u64 iova, u64 total_size, u32 access, struct mthca_mr *mr); void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr); int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, u32 access, struct mthca_fmr *fmr); int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int list_len, u64 iova); void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int list_len, u64 iova); void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr); int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt); void mthca_unmap_eq_icm(struct mthca_dev *dev); int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); int mthca_init_cq(struct mthca_dev *dev, int nent, struct mthca_ucontext *ctx, u32 pdn, struct mthca_cq *cq); void mthca_free_cq(struct mthca_dev *dev, struct mthca_cq *cq); void mthca_cq_completion(struct mthca_dev *dev, u32 cqn); void mthca_cq_event(struct mthca_dev *dev, u32 cqn, enum ib_event_type event_type); void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, struct mthca_srq *srq); void mthca_cq_resize_copy_cqes(struct mthca_cq *cq); 
int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent); void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe); int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, struct ib_srq_attr *attr, struct mthca_srq *srq); void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq); int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int mthca_max_srq_sge(struct mthca_dev *dev); void mthca_srq_event(struct mthca_dev *dev, u32 srqn, enum ib_event_type event_type); void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr); -int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); -int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mthca_tavor_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mthca_arbel_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void mthca_qp_event(struct mthca_dev *dev, u32 qpn, enum ib_event_type event_type); int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); -int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); -int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mthca_tavor_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct 
ib_send_wr **bad_wr); +int mthca_tavor_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mthca_arbel_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mthca_arbel_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, int index, int *dbd, __be32 *new_wqe); int mthca_alloc_qp(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_cq *send_cq, struct mthca_cq *recv_cq, enum ib_qp_type type, enum ib_sig_type send_policy, struct ib_qp_cap *cap, struct mthca_qp *qp); int mthca_alloc_sqp(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_cq *send_cq, struct mthca_cq *recv_cq, enum ib_sig_type send_policy, struct ib_qp_cap *cap, int qpn, int port, struct mthca_sqp *sqp); void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp); int mthca_create_ah(struct mthca_dev *dev, struct mthca_pd *pd, struct ib_ah_attr *ah_attr, struct mthca_ah *ah); int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah); int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, struct ib_ud_header *header); int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr); int mthca_ah_grh_present(struct mthca_ah *ah); u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port); enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port); int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); int mthca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, struct ib_mad_hdr *out, size_t *out_mad_size, u16 *out_mad_pkey_index); int mthca_create_agents(struct mthca_dev *dev); void mthca_free_agents(struct mthca_dev 
*dev); static inline struct mthca_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mthca_dev, ib_dev); } static inline int mthca_is_memfree(struct mthca_dev *dev) { return dev->mthca_flags & MTHCA_FLAG_MEMFREE; } #endif /* MTHCA_DEV_H */ diff --git a/sys/dev/mthca/mthca_qp.c b/sys/dev/mthca/mthca_qp.c index b726450ba4aa..51b083638f0f 100644 --- a/sys/dev/mthca/mthca_qp.c +++ b/sys/dev/mthca/mthca_qp.c @@ -1,2308 +1,2308 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include "mthca_dev.h" #include "mthca_cmd.h" #include "mthca_memfree.h" #include "mthca_wqe.h" enum { MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE, MTHCA_ACK_REQ_FREQ = 10, MTHCA_FLIGHT_LIMIT = 9, MTHCA_UD_HEADER_SIZE = 72, /* largest UD header possible */ MTHCA_INLINE_HEADER_SIZE = 4, /* data segment overhead for inline */ MTHCA_INLINE_CHUNK_SIZE = 16 /* inline data segment chunk */ }; enum { MTHCA_QP_STATE_RST = 0, MTHCA_QP_STATE_INIT = 1, MTHCA_QP_STATE_RTR = 2, MTHCA_QP_STATE_RTS = 3, MTHCA_QP_STATE_SQE = 4, MTHCA_QP_STATE_SQD = 5, MTHCA_QP_STATE_ERR = 6, MTHCA_QP_STATE_DRAINING = 7 }; enum { MTHCA_QP_ST_RC = 0x0, MTHCA_QP_ST_UC = 0x1, MTHCA_QP_ST_RD = 0x2, MTHCA_QP_ST_UD = 0x3, MTHCA_QP_ST_MLX = 0x7 }; enum { MTHCA_QP_PM_MIGRATED = 0x3, MTHCA_QP_PM_ARMED = 0x0, MTHCA_QP_PM_REARM = 0x1 }; enum { /* qp_context flags */ MTHCA_QP_BIT_DE = 1 << 8, /* params1 */ MTHCA_QP_BIT_SRE = 1 << 15, MTHCA_QP_BIT_SWE = 1 << 14, MTHCA_QP_BIT_SAE = 1 << 13, MTHCA_QP_BIT_SIC = 1 << 4, MTHCA_QP_BIT_SSC = 1 << 3, /* params2 */ MTHCA_QP_BIT_RRE = 1 << 15, MTHCA_QP_BIT_RWE = 1 << 14, MTHCA_QP_BIT_RAE = 1 << 13, MTHCA_QP_BIT_RIC = 1 << 4, MTHCA_QP_BIT_RSC = 1 << 3 }; enum { MTHCA_SEND_DOORBELL_FENCE = 1 << 5 }; struct mthca_qp_path { __be32 port_pkey; u8 rnr_retry; u8 g_mylmc; __be16 rlid; u8 ackto; u8 mgid_index; u8 static_rate; u8 hop_limit; __be32 sl_tclass_flowlabel; u8 rgid[16]; } __attribute__((packed)); struct mthca_qp_context { __be32 flags; __be32 tavor_sched_queue; /* Reserved on Arbel */ u8 mtu_msgmax; u8 rq_size_stride; /* Reserved on Tavor */ u8 sq_size_stride; /* Reserved on Tavor */ u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */ __be32 usr_page; __be32 local_qpn; __be32 remote_qpn; u32 reserved1[2]; struct mthca_qp_path pri_path; struct mthca_qp_path alt_path; __be32 rdd; __be32 pd; __be32 wqe_base; __be32 wqe_lkey; __be32 params1; __be32 reserved2; __be32 next_send_psn; __be32 cqn_snd; __be32 
snd_wqe_base_l; /* Next send WQE on Tavor */ __be32 snd_db_index; /* (debugging only entries) */ __be32 last_acked_psn; __be32 ssn; __be32 params2; __be32 rnr_nextrecvpsn; __be32 ra_buff_indx; __be32 cqn_rcv; __be32 rcv_wqe_base_l; /* Next recv WQE on Tavor */ __be32 rcv_db_index; /* (debugging only entries) */ __be32 qkey; __be32 srqn; __be32 rmsn; __be16 rq_wqe_counter; /* reserved on Tavor */ __be16 sq_wqe_counter; /* reserved on Tavor */ u32 reserved3[18]; } __attribute__((packed)); struct mthca_qp_param { __be32 opt_param_mask; u32 reserved1; struct mthca_qp_context context; u32 reserved2[62]; } __attribute__((packed)); enum { MTHCA_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, MTHCA_QP_OPTPAR_RRE = 1 << 1, MTHCA_QP_OPTPAR_RAE = 1 << 2, MTHCA_QP_OPTPAR_RWE = 1 << 3, MTHCA_QP_OPTPAR_PKEY_INDEX = 1 << 4, MTHCA_QP_OPTPAR_Q_KEY = 1 << 5, MTHCA_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, MTHCA_QP_OPTPAR_SRA_MAX = 1 << 8, MTHCA_QP_OPTPAR_RRA_MAX = 1 << 9, MTHCA_QP_OPTPAR_PM_STATE = 1 << 10, MTHCA_QP_OPTPAR_PORT_NUM = 1 << 11, MTHCA_QP_OPTPAR_RETRY_COUNT = 1 << 12, MTHCA_QP_OPTPAR_ALT_RNR_RETRY = 1 << 13, MTHCA_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, MTHCA_QP_OPTPAR_RNR_RETRY = 1 << 15, MTHCA_QP_OPTPAR_SCHED_QUEUE = 1 << 16 }; static const u8 mthca_opcode[] = { [IB_WR_SEND] = MTHCA_OPCODE_SEND, [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM, [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM, [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ, [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS, [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA, }; static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp) { return qp->qpn >= dev->qp_table.sqp_start && qp->qpn <= dev->qp_table.sqp_start + 3; } static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp) { return qp->qpn >= dev->qp_table.sqp_start && qp->qpn <= dev->qp_table.sqp_start + 1; } static void *get_recv_wqe(struct mthca_qp *qp, int n) { if 
(qp->is_direct) return qp->queue.direct.buf + (n << qp->rq.wqe_shift); else return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf + ((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1)); }

/*
 * Return the address of send WQE number n.
 *
 * The byte offset of the WQE is send_wqe_offset + (n << sq.wqe_shift).
 * When is_direct is set the offset is applied to queue.direct.buf;
 * otherwise the offset selects an entry of queue.page_list
 * (offset >> PAGE_SHIFT) and a position inside that entry's buffer
 * (offset & (PAGE_SIZE - 1)).
 */
static void *get_send_wqe(struct mthca_qp *qp, int n)
{
	if (qp->is_direct)
		return qp->queue.direct.buf + qp->send_wqe_offset +
			(n << qp->sq.wqe_shift);
	else
		return qp->queue.page_list[(qp->send_wqe_offset +
					    (n << qp->sq.wqe_shift)) >>
					   PAGE_SHIFT].buf +
			((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
			 (PAGE_SIZE - 1));
}

/*
 * Reset the bookkeeping of a work queue: next_ind, head and tail are
 * zeroed and last_comp is set to max - 1.
 */
static void mthca_wq_reset(struct mthca_wq *wq)
{
	wq->next_ind  = 0;
	wq->last_comp = wq->max - 1;
	wq->head      = 0;
	wq->tail      = 0;
}

/*
 * Pass an asynchronous event to the event handler of the QP numbered qpn.
 *
 * The QP is looked up in dev->qp_table.qp (index: qpn masked with
 * num_qps - 1) while holding qp_table.lock, and its refcount is raised
 * before the lock is dropped.  If no QP is found a warning is logged and
 * the function returns.  The reference is released again, under the same
 * lock, after the handler has been called.
 */
void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
		    enum ib_event_type event_type)
{
	struct mthca_qp *qp;
	struct ib_event event;

	spin_lock(&dev->qp_table.lock);
	qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
	if (qp)
		++qp->refcount;
	spin_unlock(&dev->qp_table.lock);

	if (!qp) {
		mthca_warn(dev, "Async event %d for bogus QP %08x\n",
			   event_type, qpn);
		return;
	}

	/* On a path-migration event the alternate port becomes the port. */
	if (event_type == IB_EVENT_PATH_MIG)
		qp->port = qp->alt_port;

	event.device     = &dev->ib_dev;
	event.event      = event_type;
	event.element.qp = &qp->ibqp;
	/* The handler is optional; it is called without qp_table.lock held. */
	if (qp->ibqp.event_handler)
		qp->ibqp.event_handler(&event, qp->ibqp.qp_context);

	/* Drop our reference; wake qp->wait when the count reaches zero. */
	spin_lock(&dev->qp_table.lock);
	if (!--qp->refcount)
		wake_up(&qp->wait);
	spin_unlock(&dev->qp_table.lock);
}

/*
 * Translate an ib_qp_state into the matching MTHCA_QP_STATE_* value.
 * Returns -1 for a state that has no case below.
 */
static int to_mthca_state(enum ib_qp_state ib_state)
{
	switch (ib_state) {
	case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
	case IB_QPS_INIT:  return MTHCA_QP_STATE_INIT;
	case IB_QPS_RTR:   return MTHCA_QP_STATE_RTR;
	case IB_QPS_RTS:   return MTHCA_QP_STATE_RTS;
	case IB_QPS_SQD:   return MTHCA_QP_STATE_SQD;
	case IB_QPS_SQE:   return MTHCA_QP_STATE_SQE;
	case IB_QPS_ERR:   return MTHCA_QP_STATE_ERR;
	default:           return -1;
	}
}

/* Transport identifiers; to_mthca_st() switches on these values. */
enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };

/* Translate a transport identifier into the matching MTHCA_QP_ST_* value. */
static int to_mthca_st(int transport) { switch (transport) { case RC: return MTHCA_QP_ST_RC; case UC: return MTHCA_QP_ST_UC; case UD: return MTHCA_QP_ST_UD; case RD: return
MTHCA_QP_ST_RD; case MLX: return MTHCA_QP_ST_MLX; default: return -1; } } static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr, int attr_mask) { if (attr_mask & IB_QP_PKEY_INDEX) sqp->pkey_index = attr->pkey_index; if (attr_mask & IB_QP_QKEY) sqp->qkey = attr->qkey; if (attr_mask & IB_QP_SQ_PSN) sqp->send_psn = attr->sq_psn; } static void init_port(struct mthca_dev *dev, int port) { int err; struct mthca_init_ib_param param; memset(¶m, 0, sizeof param); param.port_width = dev->limits.port_width_cap; param.vl_cap = dev->limits.vl_cap; param.mtu_cap = dev->limits.mtu_cap; param.gid_cap = dev->limits.gid_table_len; param.pkey_cap = dev->limits.pkey_table_len; err = mthca_INIT_IB(dev, ¶m, port); if (err) mthca_warn(dev, "INIT_IB failed, return code %d.\n", err); } static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr, int attr_mask) { u8 dest_rd_atomic; u32 access_flags; u32 hw_access_flags = 0; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else dest_rd_atomic = qp->resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else access_flags = qp->atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) hw_access_flags |= MTHCA_QP_BIT_RRE; if (access_flags & IB_ACCESS_REMOTE_ATOMIC) hw_access_flags |= MTHCA_QP_BIT_RAE; if (access_flags & IB_ACCESS_REMOTE_WRITE) hw_access_flags |= MTHCA_QP_BIT_RWE; return cpu_to_be32(hw_access_flags); } static inline enum ib_qp_state to_ib_qp_state(int mthca_state) { switch (mthca_state) { case MTHCA_QP_STATE_RST: return IB_QPS_RESET; case MTHCA_QP_STATE_INIT: return IB_QPS_INIT; case MTHCA_QP_STATE_RTR: return IB_QPS_RTR; case MTHCA_QP_STATE_RTS: return IB_QPS_RTS; case MTHCA_QP_STATE_DRAINING: case MTHCA_QP_STATE_SQD: return IB_QPS_SQD; case MTHCA_QP_STATE_SQE: return IB_QPS_SQE; case MTHCA_QP_STATE_ERR: return IB_QPS_ERR; default: return -1; } } static 
inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state)
{
	/*
	 * Translate a hardware path-migration state into an ib_mig_state.
	 * The case values 0, 1 and 3 equal MTHCA_QP_PM_ARMED,
	 * MTHCA_QP_PM_REARM and MTHCA_QP_PM_MIGRATED as defined earlier in
	 * this file; any other value yields -1.
	 */
	switch (mthca_mig_state) {
	case 0:  return IB_MIG_ARMED;
	case 1:  return IB_MIG_REARM;
	case 3:  return IB_MIG_MIGRATED;
	default: return -1;
	}
}

/*
 * Translate the MTHCA_QP_BIT_RRE/RWE/RAE bits found in mthca_flags into
 * the IB_ACCESS_REMOTE_READ/WRITE/ATOMIC flags.  All other bits of
 * mthca_flags are ignored.
 */
static int to_ib_qp_access_flags(int mthca_flags)
{
	int ib_flags = 0;

	if (mthca_flags & MTHCA_QP_BIT_RRE)
		ib_flags |= IB_ACCESS_REMOTE_READ;
	if (mthca_flags & MTHCA_QP_BIT_RWE)
		ib_flags |= IB_ACCESS_REMOTE_WRITE;
	if (mthca_flags & MTHCA_QP_BIT_RAE)
		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;

	return ib_flags;
}

/*
 * Fill *ib_ah_attr from a QP path entry.
 *
 * The output is zeroed first.  If the port number decoded from
 * path->port_pkey is 0 or larger than dev->limits.num_ports the function
 * returns at once, leaving every other field zero.
 */
static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr,
				struct mthca_qp_path *path)
{
	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
	/* Port number: bits 24-25 of port_pkey. */
	ib_ah_attr->port_num	  = (be32_to_cpu(path->port_pkey) >> 24) & 0x3;

	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports)
		return;

	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
	/* Service level: bits 28 and up of sl_tclass_flowlabel. */
	ib_ah_attr->sl		  = be32_to_cpu(path->sl_tclass_flowlabel) >> 28;
	/* g_mylmc: low seven bits are the source path bits, bit 7 the GRH flag. */
	ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f;
	ib_ah_attr->static_rate   = mthca_rate_to_ib(dev,
						     path->static_rate & 0xf,
						     ib_ah_attr->port_num);
	ib_ah_attr->ah_flags      = (path->g_mylmc & (1 << 7)) ?
IB_AH_GRH : 0; if (ib_ah_attr->ah_flags) { ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1); ib_ah_attr->grh.hop_limit = path->hop_limit; ib_ah_attr->grh.traffic_class = (be32_to_cpu(path->sl_tclass_flowlabel) >> 20) & 0xff; ib_ah_attr->grh.flow_label = be32_to_cpu(path->sl_tclass_flowlabel) & 0xfffff; memcpy(ib_ah_attr->grh.dgid.raw, path->rgid, sizeof ib_ah_attr->grh.dgid.raw); } } int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); int err = 0; struct mthca_mailbox *mailbox = NULL; struct mthca_qp_param *qp_param; struct mthca_qp_context *context; int mthca_state; mutex_lock(&qp->mutex); if (qp->state == IB_QPS_RESET) { qp_attr->qp_state = IB_QPS_RESET; goto done; } mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) { err = PTR_ERR(mailbox); goto out; } err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox); if (err) { mthca_warn(dev, "QUERY_QP failed (%d)\n", err); goto out_mailbox; } qp_param = mailbox->buf; context = &qp_param->context; mthca_state = be32_to_cpu(context->flags) >> 28; qp->state = to_ib_qp_state(mthca_state); qp_attr->qp_state = qp->state; qp_attr->path_mtu = context->mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); qp_attr->qkey = be32_to_cpu(context->qkey); qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; qp_attr->dest_qp_num = be32_to_cpu(context->remote_qpn) & 0xffffff; qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context->params2)); if (qp->transport == RC || qp->transport == UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); qp_attr->alt_pkey_index = be32_to_cpu(context->alt_path.port_pkey) & 0x7f; 
qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f; qp_attr->port_num = (be32_to_cpu(context->pri_path.port_pkey) >> 24) & 0x3; /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING; qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); qp_attr->max_dest_rd_atomic = 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); qp_attr->min_rnr_timer = (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; qp_attr->timeout = context->pri_path.ackto >> 3; qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; qp_attr->rnr_retry = context->pri_path.rnr_retry >> 5; qp_attr->alt_timeout = context->alt_path.ackto >> 3; done: qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_send_wr = qp->sq.max; qp_attr->cap.max_recv_wr = qp->rq.max; qp_attr->cap.max_send_sge = qp->sq.max_gs; qp_attr->cap.max_recv_sge = qp->rq.max_gs; qp_attr->cap.max_inline_data = qp->max_inline_data; qp_init_attr->cap = qp_attr->cap; qp_init_attr->sq_sig_type = qp->sq_policy; out_mailbox: mthca_free_mailbox(dev, mailbox); out: mutex_unlock(&qp->mutex); return err; } static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah, struct mthca_qp_path *path, u8 port) { path->g_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); path->static_rate = mthca_get_rate(dev, ah->static_rate, port); if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->limits.gid_table_len) { mthca_dbg(dev, "sgid_index (%u) too large. 
max is %d\n", ah->grh.sgid_index, dev->limits.gid_table_len-1); return -1; } path->g_mylmc |= 1 << 7; path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; path->sl_tclass_flowlabel = cpu_to_be32((ah->sl << 28) | (ah->grh.traffic_class << 20) | (ah->grh.flow_label)); memcpy(path->rgid, ah->grh.dgid.raw, 16); } else path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28); return 0; } static int __mthca_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); struct mthca_mailbox *mailbox; struct mthca_qp_param *qp_param; struct mthca_qp_context *qp_context; u32 sqd_event = 0; int err = -EINVAL; mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) { err = PTR_ERR(mailbox); goto out; } qp_param = mailbox->buf; qp_context = &qp_param->context; memset(qp_param, 0, sizeof *qp_param); qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) | (to_mthca_st(qp->transport) << 16)); qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); else { qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE); switch (attr->path_mig_state) { case IB_MIG_MIGRATED: qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); break; case IB_MIG_REARM: qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11); break; case IB_MIG_ARMED: qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11); break; } } /* leave tavor_sched_queue as 0 */ if (qp->transport == MLX || qp->transport == UD) qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11; else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) { mthca_dbg(dev, "path MTU (%u) is invalid\n", attr->path_mtu); goto out_mailbox; } qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31; } if 
(mthca_is_memfree(dev)) { if (qp->rq.max) qp_context->rq_size_stride = ilog2(qp->rq.max) << 3; qp_context->rq_size_stride |= qp->rq.wqe_shift - 4; if (qp->sq.max) qp_context->sq_size_stride = ilog2(qp->sq.max) << 3; qp_context->sq_size_stride |= qp->sq.wqe_shift - 4; } /* leave arbel_sched_queue as 0 */ if (qp->ibqp.uobject) qp_context->usr_page = cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index); else qp_context->usr_page = cpu_to_be32(dev->driver_uar.index); qp_context->local_qpn = cpu_to_be32(qp->qpn); if (attr_mask & IB_QP_DEST_QPN) { qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num); } if (qp->transport == MLX) qp_context->pri_path.port_pkey |= cpu_to_be32(qp->port << 24); else { if (attr_mask & IB_QP_PORT) { qp_context->pri_path.port_pkey |= cpu_to_be32(attr->port_num << 24); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM); } } if (attr_mask & IB_QP_PKEY_INDEX) { qp_context->pri_path.port_pkey |= cpu_to_be32(attr->pkey_index); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX); } if (attr_mask & IB_QP_RNR_RETRY) { qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry = attr->rnr_retry << 5; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY | MTHCA_QP_OPTPAR_ALT_RNR_RETRY); } if (attr_mask & IB_QP_AV) { if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) goto out_mailbox; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); } if (ibqp->qp_type == IB_QPT_RC && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { u8 sched_queue = ibqp->uobject ? 
0x2 : 0x1; if (mthca_is_memfree(dev)) qp_context->rlkey_arbel_sched_queue |= sched_queue; else qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE); } if (attr_mask & IB_QP_TIMEOUT) { qp_context->pri_path.ackto = attr->timeout << 3; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT); } if (attr_mask & IB_QP_ALT_PATH) { if (attr->alt_pkey_index >= dev->limits.pkey_table_len) { mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n", attr->alt_pkey_index, dev->limits.pkey_table_len-1); goto out_mailbox; } if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) { mthca_dbg(dev, "Alternate port number (%u) is invalid\n", attr->alt_port_num); goto out_mailbox; } if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, attr->alt_ah_attr.port_num)) goto out_mailbox; qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | attr->alt_port_num << 24); qp_context->alt_path.ackto = attr->alt_timeout << 3; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH); } /* leave rdd as 0 */ qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num); /* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */ qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey); qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) | (MTHCA_FLIGHT_LIMIT << 24) | MTHCA_QP_BIT_SWE); if (qp->sq_policy == IB_SIGNAL_ALL_WR) qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC); if (attr_mask & IB_QP_RETRY_CNT) { qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT); } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic) { qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SRE | MTHCA_QP_BIT_SAE); qp_context->params1 |= cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); } qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX); } if (attr_mask & 
IB_QP_SQ_PSN) qp_context->next_send_psn = cpu_to_be32(attr->sq_psn); qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn); if (mthca_is_memfree(dev)) { qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset); qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index); } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) qp_context->params2 |= cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX); } if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { qp_context->params2 |= get_hw_access_flags(qp, attr, attr_mask); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE | MTHCA_QP_OPTPAR_RRE | MTHCA_QP_OPTPAR_RAE); } qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC); if (ibqp->srq) qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC); if (attr_mask & IB_QP_MIN_RNR_TIMER) { qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT); } if (attr_mask & IB_QP_RQ_PSN) qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); qp_context->ra_buff_indx = cpu_to_be32(dev->qp_table.rdb_base + ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE << dev->qp_table.rdb_shift)); qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn); if (mthca_is_memfree(dev)) qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index); if (attr_mask & IB_QP_QKEY) { qp_context->qkey = cpu_to_be32(attr->qkey); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY); } if (ibqp->srq) qp_context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->srqn); if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1 << 31; err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0, mailbox, sqd_event); if (err) { mthca_warn(dev, "modify QP %d->%d returned %d.\n", cur_state, new_state, err); goto out_mailbox; } 
qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) qp->atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_ALT_PATH) qp->alt_port = attr->alt_port_num; if (is_sqp(dev, qp)) store_attrs(to_msqp(qp), attr, attr_mask); /* * If we moved QP0 to RTR, bring the IB link up; if we moved * QP0 to RESET or ERROR, bring the link back down. */ if (is_qp0(dev, qp)) { if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) init_port(dev, qp->port); if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) mthca_CLOSE_IB(dev, qp->port); } /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) { mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); if (qp->ibqp.send_cq != qp->ibqp.recv_cq) mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL); mthca_wq_reset(&qp->sq); qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); mthca_wq_reset(&qp->rq); qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); if (mthca_is_memfree(dev)) { *qp->sq.db = 0; *qp->rq.db = 0; } } out_mailbox: mthca_free_mailbox(dev, mailbox); out: return err; } int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; mutex_lock(&qp->mutex); if (attr_mask & IB_QP_CUR_STATE) { cur_state = attr->cur_qp_state; } else { spin_lock_irq(&qp->sq.lock); spin_lock(&qp->rq.lock); cur_state = qp->state; spin_unlock(&qp->rq.lock); spin_unlock_irq(&qp->sq.lock); } new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, attr_mask); goto out; } if ((attr_mask & IB_QP_PKEY_INDEX) && attr->pkey_index >= dev->limits.pkey_table_len) { mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n", attr->pkey_index, dev->limits.pkey_table_len-1); goto out; } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); goto out; } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", attr->max_rd_atomic, dev->limits.max_qp_init_rdma); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); goto out; } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; goto out; } err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); out: mutex_unlock(&qp->mutex); return err; } static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz) { /* * Calculate the maximum size of WQE s/g segments, excluding * the next segment and other non-data segments. 
*/ int max_data_size = desc_sz - sizeof (struct mthca_next_seg); switch (qp->transport) { case MLX: max_data_size -= 2 * sizeof (struct mthca_data_seg); break; case UD: if (mthca_is_memfree(dev)) max_data_size -= sizeof (struct mthca_arbel_ud_seg); else max_data_size -= sizeof (struct mthca_tavor_ud_seg); break; default: max_data_size -= sizeof (struct mthca_raddr_seg); break; } return max_data_size; } static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size) { /* We don't support inline data for kernel QPs (yet). */ return pd->ibpd.uobject ? max_data_size - MTHCA_INLINE_HEADER_SIZE : 0; } static void mthca_adjust_qp_caps(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_qp *qp) { int max_data_size = mthca_max_data_size(dev, qp, min(dev->limits.max_desc_sz, 1 << qp->sq.wqe_shift)); qp->max_inline_data = mthca_max_inline_data(pd, max_data_size); qp->sq.max_gs = min_t(int, dev->limits.max_sg, max_data_size / sizeof (struct mthca_data_seg)); qp->rq.max_gs = min_t(int, dev->limits.max_sg, (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) - sizeof (struct mthca_next_seg)) / sizeof (struct mthca_data_seg)); } /* * Allocate and register buffer for WQEs. qp->rq.max, sq.max, * rq.max_gs and sq.max_gs must all be assigned. * mthca_alloc_wqe_buf will calculate rq.wqe_shift and * sq.wqe_shift (as well as send_wqe_offset, is_direct, and * queue) */ static int mthca_alloc_wqe_buf(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_qp *qp) { int size; int err = -ENOMEM; size = sizeof (struct mthca_next_seg) + qp->rq.max_gs * sizeof (struct mthca_data_seg); if (size > dev->limits.max_desc_sz) return -EINVAL; for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size; qp->rq.wqe_shift++) ; /* nothing */ size = qp->sq.max_gs * sizeof (struct mthca_data_seg); switch (qp->transport) { case MLX: size += 2 * sizeof (struct mthca_data_seg); break; case UD: size += mthca_is_memfree(dev) ? 
sizeof (struct mthca_arbel_ud_seg) : sizeof (struct mthca_tavor_ud_seg); break; case UC: size += sizeof (struct mthca_raddr_seg); break; case RC: size += sizeof (struct mthca_raddr_seg); /* * An atomic op will require an atomic segment, a * remote address segment and one scatter entry. */ size = max_t(int, size, sizeof (struct mthca_atomic_seg) + sizeof (struct mthca_raddr_seg) + sizeof (struct mthca_data_seg)); break; default: break; } /* Make sure that we have enough space for a bind request */ size = max_t(int, size, sizeof (struct mthca_bind_seg)); size += sizeof (struct mthca_next_seg); if (size > dev->limits.max_desc_sz) return -EINVAL; for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; qp->sq.wqe_shift++) ; /* nothing */ qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift, 1 << qp->sq.wqe_shift); /* * If this is a userspace QP, we don't actually have to * allocate anything. All we need is to calculate the WQE * sizes and the send_wqe_offset, so we're done now. */ if (pd->ibpd.uobject) return 0; size = PAGE_ALIGN(qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift)); qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64), GFP_KERNEL); if (!qp->wrid) goto err_out; err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE, &qp->queue, &qp->is_direct, pd, 0, &qp->mr); if (err) goto err_out; return 0; err_out: kfree(qp->wrid); return err; } static void mthca_free_wqe_buf(struct mthca_dev *dev, struct mthca_qp *qp) { mthca_buf_free(dev, PAGE_ALIGN(qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift)), &qp->queue, qp->is_direct, &qp->mr); kfree(qp->wrid); } static int mthca_map_memfree(struct mthca_dev *dev, struct mthca_qp *qp) { int ret; if (mthca_is_memfree(dev)) { ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn); if (ret) return ret; ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn); if (ret) goto err_qpc; ret = mthca_table_get(dev, dev->qp_table.rdb_table, qp->qpn << dev->qp_table.rdb_shift); if (ret) goto err_eqpc; 
} return 0; err_eqpc: mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); err_qpc: mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); return ret; } static void mthca_unmap_memfree(struct mthca_dev *dev, struct mthca_qp *qp) { mthca_table_put(dev, dev->qp_table.rdb_table, qp->qpn << dev->qp_table.rdb_shift); mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); } static int mthca_alloc_memfree(struct mthca_dev *dev, struct mthca_qp *qp) { if (mthca_is_memfree(dev)) { qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ, qp->qpn, &qp->rq.db); if (qp->rq.db_index < 0) return -ENOMEM; qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ, qp->qpn, &qp->sq.db); if (qp->sq.db_index < 0) { mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); return -ENOMEM; } } return 0; } static void mthca_free_memfree(struct mthca_dev *dev, struct mthca_qp *qp) { if (mthca_is_memfree(dev)) { mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index); mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); } } static int mthca_alloc_qp_common(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_cq *send_cq, struct mthca_cq *recv_cq, enum ib_sig_type send_policy, struct mthca_qp *qp) { int ret; int i; struct mthca_next_seg *next; qp->refcount = 1; init_waitqueue_head(&qp->wait); mutex_init(&qp->mutex); qp->state = IB_QPS_RESET; qp->atomic_rd_en = 0; qp->resp_depth = 0; qp->sq_policy = send_policy; mthca_wq_reset(&qp->sq); mthca_wq_reset(&qp->rq); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); ret = mthca_map_memfree(dev, qp); if (ret) return ret; ret = mthca_alloc_wqe_buf(dev, pd, qp); if (ret) { mthca_unmap_memfree(dev, qp); return ret; } mthca_adjust_qp_caps(dev, pd, qp); /* * If this is a userspace QP, we're done now. The doorbells * will be allocated and buffers will be initialized in * userspace. 
*/ if (pd->ibpd.uobject) return 0; ret = mthca_alloc_memfree(dev, qp); if (ret) { mthca_free_wqe_buf(dev, qp); mthca_unmap_memfree(dev, qp); return ret; } if (mthca_is_memfree(dev)) { struct mthca_data_seg *scatter; int size = (sizeof (struct mthca_next_seg) + qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16; for (i = 0; i < qp->rq.max; ++i) { next = get_recv_wqe(qp, i); next->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) << qp->rq.wqe_shift); next->ee_nds = cpu_to_be32(size); for (scatter = (void *) (next + 1); (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift); ++scatter) scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); } for (i = 0; i < qp->sq.max; ++i) { next = get_send_wqe(qp, i); next->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) << qp->sq.wqe_shift) + qp->send_wqe_offset); } } else { for (i = 0; i < qp->rq.max; ++i) { next = get_recv_wqe(qp, i); next->nda_op = htonl((((i + 1) % qp->rq.max) << qp->rq.wqe_shift) | 1); } } qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); return 0; } static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap, struct mthca_pd *pd, struct mthca_qp *qp) { int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz); /* Sanity check QP size before proceeding */ if (cap->max_send_wr > dev->limits.max_wqes || cap->max_recv_wr > dev->limits.max_wqes || cap->max_send_sge > dev->limits.max_sg || cap->max_recv_sge > dev->limits.max_sg || cap->max_inline_data > mthca_max_inline_data(pd, max_data_size)) return -EINVAL; /* * For MLX transport we need 2 extra send gather entries: * one for the header and one for the checksum at the end */ if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg) return -EINVAL; if (mthca_is_memfree(dev)) { qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0; qp->sq.max = cap->max_send_wr ? 
roundup_pow_of_two(cap->max_send_wr) : 0; } else { qp->rq.max = cap->max_recv_wr; qp->sq.max = cap->max_send_wr; } qp->rq.max_gs = cap->max_recv_sge; qp->sq.max_gs = max_t(int, cap->max_send_sge, ALIGN(cap->max_inline_data + MTHCA_INLINE_HEADER_SIZE, MTHCA_INLINE_CHUNK_SIZE) / sizeof (struct mthca_data_seg)); return 0; } int mthca_alloc_qp(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_cq *send_cq, struct mthca_cq *recv_cq, enum ib_qp_type type, enum ib_sig_type send_policy, struct ib_qp_cap *cap, struct mthca_qp *qp) { int err; switch (type) { case IB_QPT_RC: qp->transport = RC; break; case IB_QPT_UC: qp->transport = UC; break; case IB_QPT_UD: qp->transport = UD; break; default: return -EINVAL; } err = mthca_set_qp_size(dev, cap, pd, qp); if (err) return err; qp->qpn = mthca_alloc(&dev->qp_table.alloc); if (qp->qpn == -1) return -ENOMEM; /* initialize port to zero for error-catching. */ qp->port = 0; err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, send_policy, qp); if (err) { mthca_free(&dev->qp_table.alloc, qp->qpn); return err; } spin_lock_irq(&dev->qp_table.lock); mthca_array_set(&dev->qp_table.qp, qp->qpn & (dev->limits.num_qps - 1), qp); spin_unlock_irq(&dev->qp_table.lock); return 0; } static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { spin_lock_irq(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) __releases(&send_cq->lock) __releases(&recv_cq->lock) { if (send_cq == recv_cq) { __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { spin_unlock(&recv_cq->lock); 
spin_unlock_irq(&send_cq->lock); } else { spin_unlock(&send_cq->lock); spin_unlock_irq(&recv_cq->lock); } } int mthca_alloc_sqp(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_cq *send_cq, struct mthca_cq *recv_cq, enum ib_sig_type send_policy, struct ib_qp_cap *cap, int qpn, int port, struct mthca_sqp *sqp) { u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1; int err; sqp->qp.transport = MLX; err = mthca_set_qp_size(dev, cap, pd, &sqp->qp); if (err) return err; sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE; sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size, &sqp->header_dma, GFP_KERNEL); if (!sqp->header_buf) return -ENOMEM; spin_lock_irq(&dev->qp_table.lock); if (mthca_array_get(&dev->qp_table.qp, mqpn)) err = -EBUSY; else mthca_array_set(&dev->qp_table.qp, mqpn, sqp); spin_unlock_irq(&dev->qp_table.lock); if (err) goto err_out; sqp->qp.port = port; sqp->qp.qpn = mqpn; sqp->qp.transport = MLX; err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, send_policy, &sqp->qp); if (err) goto err_out_free; atomic_inc(&pd->sqp_count); return 0; err_out_free: /* * Lock CQs here, so that CQ polling code can do QP lookup * without taking a lock. */ mthca_lock_cqs(send_cq, recv_cq); spin_lock(&dev->qp_table.lock); mthca_array_clear(&dev->qp_table.qp, mqpn); spin_unlock(&dev->qp_table.lock); mthca_unlock_cqs(send_cq, recv_cq); err_out: dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size, sqp->header_buf, sqp->header_dma); return err; } static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp) { int c; spin_lock_irq(&dev->qp_table.lock); c = qp->refcount; spin_unlock_irq(&dev->qp_table.lock); return c; } void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp) { struct mthca_cq *send_cq; struct mthca_cq *recv_cq; send_cq = to_mcq(qp->ibqp.send_cq); recv_cq = to_mcq(qp->ibqp.recv_cq); /* * Lock CQs here, so that CQ polling code can do QP lookup * without taking a lock. 
*/ mthca_lock_cqs(send_cq, recv_cq); spin_lock(&dev->qp_table.lock); mthca_array_clear(&dev->qp_table.qp, qp->qpn & (dev->limits.num_qps - 1)); --qp->refcount; spin_unlock(&dev->qp_table.lock); mthca_unlock_cqs(send_cq, recv_cq); wait_event(qp->wait, !get_qp_refcount(dev, qp)); if (qp->state != IB_QPS_RESET) mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0, NULL, 0); /* * If this is a userspace QP, the buffers, MR, CQs and so on * will be cleaned up in userspace, so all we have to do is * unref the mem-free tables and free the QPN in our table. */ if (!qp->ibqp.uobject) { mthca_cq_clean(dev, recv_cq, qp->qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); if (send_cq != recv_cq) mthca_cq_clean(dev, send_cq, qp->qpn, NULL); mthca_free_memfree(dev, qp); mthca_free_wqe_buf(dev, qp); } mthca_unmap_memfree(dev, qp); if (is_sqp(dev, qp)) { atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count)); dma_free_coherent(&dev->pdev->dev, to_msqp(qp)->header_buf_size, to_msqp(qp)->header_buf, to_msqp(qp)->header_dma); } else mthca_free(&dev->qp_table.alloc, qp->qpn); } /* Create UD header for an MLX send and build a data segment for it */ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, - int ind, struct ib_ud_wr *wr, + int ind, const struct ib_ud_wr *wr, struct mthca_mlx_seg *mlx, struct mthca_data_seg *data) { int header_size; int err; u16 pkey; ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, mthca_ah_grh_present(to_mah(wr->ah)), 0, 0, 0, &sqp->ud_header); err = mthca_read_ah(dev, to_mah(wr->ah), &sqp->ud_header); if (err) return err; mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1); mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) | (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE ? 
MTHCA_MLX_SLR : 0) | (sqp->ud_header.lrh.service_level << 8)); mlx->rlid = sqp->ud_header.lrh.destination_lid; mlx->vcrc = 0; switch (wr->wr.opcode) { case IB_WR_SEND: sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; sqp->ud_header.immediate_present = 0; break; case IB_WR_SEND_WITH_IMM: sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; sqp->ud_header.immediate_present = 1; sqp->ud_header.immediate_data = wr->wr.ex.imm_data; break; default: return -EINVAL; } sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); if (!sqp->qp.ibqp.qp_num) ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); else ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, wr->pkey_index, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ? 
sqp->qkey : wr->remote_qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf + ind * MTHCA_UD_HEADER_SIZE); data->byte_count = cpu_to_be32(header_size); data->lkey = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey); data->addr = cpu_to_be64(sqp->header_dma + ind * MTHCA_UD_HEADER_SIZE); return 0; } static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq, struct ib_cq *ib_cq) { unsigned cur; struct mthca_cq *cq; cur = wq->head - wq->tail; if (likely(cur + nreq < wq->max)) return 0; cq = to_mcq(ib_cq); spin_lock(&cq->lock); cur = wq->head - wq->tail; spin_unlock(&cq->lock); return cur + nreq >= wq->max; } static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg, u64 remote_addr, u32 rkey) { rseg->raddr = cpu_to_be64(remote_addr); rseg->rkey = cpu_to_be32(rkey); rseg->reserved = 0; } static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg, - struct ib_atomic_wr *wr) + const struct ib_atomic_wr *wr) { if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { aseg->swap_add = cpu_to_be64(wr->swap); aseg->compare = cpu_to_be64(wr->compare_add); } else { aseg->swap_add = cpu_to_be64(wr->compare_add); aseg->compare = 0; } } static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg, - struct ib_ud_wr *wr) + const struct ib_ud_wr *wr) { useg->lkey = cpu_to_be32(to_mah(wr->ah)->key); useg->av_addr = cpu_to_be64(to_mah(wr->ah)->avdma); useg->dqpn = cpu_to_be32(wr->remote_qpn); useg->qkey = cpu_to_be32(wr->remote_qkey); } static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg, - struct ib_ud_wr *wr) + const struct ib_ud_wr *wr) { memcpy(useg->av, to_mah(wr->ah)->av, MTHCA_AV_SIZE); useg->dqpn = cpu_to_be32(wr->remote_qpn); useg->qkey = cpu_to_be32(wr->remote_qkey); } -int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int mthca_tavor_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const 
struct ib_send_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); void *wqe; void *prev_wqe; unsigned long flags; int err = 0; int nreq; int i; int size; /* * f0 and size0 are only used if nreq != 0, and they will * always be initialized the first time through the main loop * before nreq is incremented. So nreq cannot become non-zero * without initializing f0 and size0, and they are in fact * never used uninitialized. */ int uninitialized_var(size0); u32 uninitialized_var(f0); int ind; u8 op0 = 0; spin_lock_irqsave(&qp->sq.lock, flags); /* XXX check that state is OK to post send */ ind = qp->sq.next_ind; for (nreq = 0; wr; ++nreq, wr = wr->next) { if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { mthca_err(dev, "SQ %06x full (%u head, %u tail," " %d max, %d nreq)\n", qp->qpn, qp->sq.head, qp->sq.tail, qp->sq.max, nreq); err = -ENOMEM; *bad_wr = wr; goto out; } wqe = get_send_wqe(qp, ind); prev_wqe = qp->sq.last; qp->sq.last = wqe; ((struct mthca_next_seg *) wqe)->nda_op = 0; ((struct mthca_next_seg *) wqe)->ee_nds = 0; ((struct mthca_next_seg *) wqe)->flags = ((wr->send_flags & IB_SEND_SIGNALED) ? cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | ((wr->send_flags & IB_SEND_SOLICITED) ? 
cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | cpu_to_be32(1); if (wr->opcode == IB_WR_SEND_WITH_IMM || wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; wqe += sizeof (struct mthca_next_seg); size = sizeof (struct mthca_next_seg) / 16; switch (qp->transport) { case RC: switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, atomic_wr(wr)->rkey); wqe += sizeof (struct mthca_raddr_seg); set_atomic_seg(wqe, atomic_wr(wr)); wqe += sizeof (struct mthca_atomic_seg); size += (sizeof (struct mthca_raddr_seg) + sizeof (struct mthca_atomic_seg)) / 16; break; case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: case IB_WR_RDMA_READ: set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); wqe += sizeof (struct mthca_raddr_seg); size += sizeof (struct mthca_raddr_seg) / 16; break; default: /* No extra segments required for sends */ break; } break; case UC: switch (wr->opcode) { case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); wqe += sizeof (struct mthca_raddr_seg); size += sizeof (struct mthca_raddr_seg) / 16; break; default: /* No extra segments required for sends */ break; } break; case UD: set_tavor_ud_seg(wqe, ud_wr(wr)); wqe += sizeof (struct mthca_tavor_ud_seg); size += sizeof (struct mthca_tavor_ud_seg) / 16; break; case MLX: err = build_mlx_header(dev, to_msqp(qp), ind, ud_wr(wr), wqe - sizeof (struct mthca_next_seg), wqe); if (err) { *bad_wr = wr; goto out; } wqe += sizeof (struct mthca_data_seg); size += sizeof (struct mthca_data_seg) / 16; break; } if (wr->num_sge > qp->sq.max_gs) { mthca_err(dev, "too many gathers\n"); err = -EINVAL; *bad_wr = wr; goto out; } for (i = 0; i < wr->num_sge; ++i) { mthca_set_data_seg(wqe, wr->sg_list + i); wqe += sizeof (struct mthca_data_seg); size += sizeof (struct mthca_data_seg) / 16; } /* Add one more inline data segment for ICRC */ if 
(qp->transport == MLX) { ((struct mthca_data_seg *) wqe)->byte_count = cpu_to_be32((1 << 31) | 4); ((u32 *) wqe)[1] = 0; wqe += sizeof (struct mthca_data_seg); size += sizeof (struct mthca_data_seg) / 16; } qp->wrid[ind + qp->rq.max] = wr->wr_id; if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { mthca_err(dev, "opcode invalid\n"); err = -EINVAL; *bad_wr = wr; goto out; } ((struct mthca_next_seg *) prev_wqe)->nda_op = cpu_to_be32(((ind << qp->sq.wqe_shift) + qp->send_wqe_offset) | mthca_opcode[wr->opcode]); wmb(); ((struct mthca_next_seg *) prev_wqe)->ee_nds = cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size | ((wr->send_flags & IB_SEND_FENCE) ? MTHCA_NEXT_FENCE : 0)); if (!nreq) { size0 = size; op0 = mthca_opcode[wr->opcode]; f0 = wr->send_flags & IB_SEND_FENCE ? MTHCA_SEND_DOORBELL_FENCE : 0; } ++ind; if (unlikely(ind >= qp->sq.max)) ind -= qp->sq.max; } out: if (likely(nreq)) { wmb(); mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) + qp->send_wqe_offset) | f0 | op0, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); /* * Make sure doorbells don't leak out of SQ spinlock * and reach the HCA out of order: */ mmiowb(); } qp->sq.next_ind = ind; qp->sq.head += nreq; spin_unlock_irqrestore(&qp->sq.lock, flags); return err; } -int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_tavor_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); unsigned long flags; int err = 0; int nreq; int i; int size; /* * size0 is only used if nreq != 0, and it will always be * initialized the first time through the main loop before * nreq is incremented. So nreq cannot become non-zero * without initializing size0, and it is in fact never used * uninitialized. 
*/ int uninitialized_var(size0); int ind; void *wqe; void *prev_wqe; spin_lock_irqsave(&qp->rq.lock, flags); /* XXX check that state is OK to post receive */ ind = qp->rq.next_ind; for (nreq = 0; wr; wr = wr->next) { if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { mthca_err(dev, "RQ %06x full (%u head, %u tail," " %d max, %d nreq)\n", qp->qpn, qp->rq.head, qp->rq.tail, qp->rq.max, nreq); err = -ENOMEM; *bad_wr = wr; goto out; } wqe = get_recv_wqe(qp, ind); prev_wqe = qp->rq.last; qp->rq.last = wqe; ((struct mthca_next_seg *) wqe)->ee_nds = cpu_to_be32(MTHCA_NEXT_DBD); ((struct mthca_next_seg *) wqe)->flags = 0; wqe += sizeof (struct mthca_next_seg); size = sizeof (struct mthca_next_seg) / 16; if (unlikely(wr->num_sge > qp->rq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } for (i = 0; i < wr->num_sge; ++i) { mthca_set_data_seg(wqe, wr->sg_list + i); wqe += sizeof (struct mthca_data_seg); size += sizeof (struct mthca_data_seg) / 16; } qp->wrid[ind] = wr->wr_id; ((struct mthca_next_seg *) prev_wqe)->ee_nds = cpu_to_be32(MTHCA_NEXT_DBD | size); if (!nreq) size0 = size; ++ind; if (unlikely(ind >= qp->rq.max)) ind -= qp->rq.max; ++nreq; if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { nreq = 0; wmb(); mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); qp->rq.next_ind = ind; qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; } } out: if (likely(nreq)) { wmb(); mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } qp->rq.next_ind = ind; qp->rq.head += nreq; /* * Make sure doorbells don't leak out of RQ spinlock and reach * the HCA out of order: */ mmiowb(); spin_unlock_irqrestore(&qp->rq.lock, flags); return err; } -int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int 
mthca_arbel_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); u32 dbhi; void *wqe; void *prev_wqe; unsigned long flags; int err = 0; int nreq; int i; int size; /* * f0 and size0 are only used if nreq != 0, and they will * always be initialized the first time through the main loop * before nreq is incremented. So nreq cannot become non-zero * without initializing f0 and size0, and they are in fact * never used uninitialized. */ int uninitialized_var(size0); u32 uninitialized_var(f0); int ind; u8 op0 = 0; spin_lock_irqsave(&qp->sq.lock, flags); /* XXX check that state is OK to post send */ ind = qp->sq.head & (qp->sq.max - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) { nreq = 0; dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0; qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); /* * Make sure doorbell record is written before we * write MMIO send doorbell. */ wmb(); mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { mthca_err(dev, "SQ %06x full (%u head, %u tail," " %d max, %d nreq)\n", qp->qpn, qp->sq.head, qp->sq.tail, qp->sq.max, nreq); err = -ENOMEM; *bad_wr = wr; goto out; } wqe = get_send_wqe(qp, ind); prev_wqe = qp->sq.last; qp->sq.last = wqe; ((struct mthca_next_seg *) wqe)->flags = ((wr->send_flags & IB_SEND_SIGNALED) ? cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | ((wr->send_flags & IB_SEND_SOLICITED) ? cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | ((wr->send_flags & IB_SEND_IP_CSUM) ? 
cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) | cpu_to_be32(1); if (wr->opcode == IB_WR_SEND_WITH_IMM || wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; wqe += sizeof (struct mthca_next_seg); size = sizeof (struct mthca_next_seg) / 16; switch (qp->transport) { case RC: switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, atomic_wr(wr)->rkey); wqe += sizeof (struct mthca_raddr_seg); set_atomic_seg(wqe, atomic_wr(wr)); wqe += sizeof (struct mthca_atomic_seg); size += (sizeof (struct mthca_raddr_seg) + sizeof (struct mthca_atomic_seg)) / 16; break; case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); wqe += sizeof (struct mthca_raddr_seg); size += sizeof (struct mthca_raddr_seg) / 16; break; default: /* No extra segments required for sends */ break; } break; case UC: switch (wr->opcode) { case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); wqe += sizeof (struct mthca_raddr_seg); size += sizeof (struct mthca_raddr_seg) / 16; break; default: /* No extra segments required for sends */ break; } break; case UD: set_arbel_ud_seg(wqe, ud_wr(wr)); wqe += sizeof (struct mthca_arbel_ud_seg); size += sizeof (struct mthca_arbel_ud_seg) / 16; break; case MLX: err = build_mlx_header(dev, to_msqp(qp), ind, ud_wr(wr), wqe - sizeof (struct mthca_next_seg), wqe); if (err) { *bad_wr = wr; goto out; } wqe += sizeof (struct mthca_data_seg); size += sizeof (struct mthca_data_seg) / 16; break; } if (wr->num_sge > qp->sq.max_gs) { mthca_err(dev, "too many gathers\n"); err = -EINVAL; *bad_wr = wr; goto out; } for (i = 0; i < wr->num_sge; ++i) { mthca_set_data_seg(wqe, wr->sg_list + i); wqe += sizeof (struct mthca_data_seg); size += sizeof (struct mthca_data_seg) / 16; } /* Add one more inline 
data segment for ICRC */ if (qp->transport == MLX) { ((struct mthca_data_seg *) wqe)->byte_count = cpu_to_be32((1 << 31) | 4); ((u32 *) wqe)[1] = 0; wqe += sizeof (struct mthca_data_seg); size += sizeof (struct mthca_data_seg) / 16; } qp->wrid[ind + qp->rq.max] = wr->wr_id; if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { mthca_err(dev, "opcode invalid\n"); err = -EINVAL; *bad_wr = wr; goto out; } ((struct mthca_next_seg *) prev_wqe)->nda_op = cpu_to_be32(((ind << qp->sq.wqe_shift) + qp->send_wqe_offset) | mthca_opcode[wr->opcode]); wmb(); ((struct mthca_next_seg *) prev_wqe)->ee_nds = cpu_to_be32(MTHCA_NEXT_DBD | size | ((wr->send_flags & IB_SEND_FENCE) ? MTHCA_NEXT_FENCE : 0)); if (!nreq) { size0 = size; op0 = mthca_opcode[wr->opcode]; f0 = wr->send_flags & IB_SEND_FENCE ? MTHCA_SEND_DOORBELL_FENCE : 0; } ++ind; if (unlikely(ind >= qp->sq.max)) ind -= qp->sq.max; } out: if (likely(nreq)) { dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0; qp->sq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); /* * Make sure doorbell record is written before we * write MMIO send doorbell. 
*/ wmb(); mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } /* * Make sure doorbells don't leak out of SQ spinlock and reach * the HCA out of order: */ mmiowb(); spin_unlock_irqrestore(&qp->sq.lock, flags); return err; } -int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_arbel_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); unsigned long flags; int err = 0; int nreq; int ind; int i; void *wqe; spin_lock_irqsave(&qp->rq.lock, flags); /* XXX check that state is OK to post receive */ ind = qp->rq.head & (qp->rq.max - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { mthca_err(dev, "RQ %06x full (%u head, %u tail," " %d max, %d nreq)\n", qp->qpn, qp->rq.head, qp->rq.tail, qp->rq.max, nreq); err = -ENOMEM; *bad_wr = wr; goto out; } wqe = get_recv_wqe(qp, ind); ((struct mthca_next_seg *) wqe)->flags = 0; wqe += sizeof (struct mthca_next_seg); if (unlikely(wr->num_sge > qp->rq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } for (i = 0; i < wr->num_sge; ++i) { mthca_set_data_seg(wqe, wr->sg_list + i); wqe += sizeof (struct mthca_data_seg); } if (i < qp->rq.max_gs) mthca_set_data_seg_inval(wqe); qp->wrid[ind] = wr->wr_id; ++ind; if (unlikely(ind >= qp->rq.max)) ind -= qp->rq.max; } out: if (likely(nreq)) { qp->rq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. 
*/ wmb(); *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff); } spin_unlock_irqrestore(&qp->rq.lock, flags); return err; } void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, int index, int *dbd, __be32 *new_wqe) { struct mthca_next_seg *next; /* * For SRQs, all receive WQEs generate a CQE, so we're always * at the end of the doorbell chain. */ if (qp->ibqp.srq && !is_send) { *new_wqe = 0; return; } if (is_send) next = get_send_wqe(qp, index); else next = get_recv_wqe(qp, index); *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD)); if (next->ee_nds & cpu_to_be32(0x3f)) *new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) | (next->ee_nds & cpu_to_be32(0x3f)); else *new_wqe = 0; } int mthca_init_qp_table(struct mthca_dev *dev) { int err; int i; spin_lock_init(&dev->qp_table.lock); /* * We reserve 2 extra QPs per port for the special QPs. The * special QP for port 1 has to be even, so round up. */ dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL; err = mthca_alloc_init(&dev->qp_table.alloc, dev->limits.num_qps, (1 << 24) - 1, dev->qp_table.sqp_start + MTHCA_MAX_PORTS * 2); if (err) return err; err = mthca_array_init(&dev->qp_table.qp, dev->limits.num_qps); if (err) { mthca_alloc_cleanup(&dev->qp_table.alloc); return err; } for (i = 0; i < 2; ++i) { err = mthca_CONF_SPECIAL_QP(dev, i ? 
IB_QPT_GSI : IB_QPT_SMI, dev->qp_table.sqp_start + i * 2); if (err) { mthca_warn(dev, "CONF_SPECIAL_QP returned " "%d, aborting.\n", err); goto err_out; } } return 0; err_out: for (i = 0; i < 2; ++i) mthca_CONF_SPECIAL_QP(dev, i, 0); mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); mthca_alloc_cleanup(&dev->qp_table.alloc); return err; } void mthca_cleanup_qp_table(struct mthca_dev *dev) { int i; for (i = 0; i < 2; ++i) mthca_CONF_SPECIAL_QP(dev, i, 0); mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); mthca_alloc_cleanup(&dev->qp_table.alloc); } diff --git a/sys/dev/mthca/mthca_srq.c b/sys/dev/mthca/mthca_srq.c index 579c23a73654..3faa8186c182 100644 --- a/sys/dev/mthca/mthca_srq.c +++ b/sys/dev/mthca/mthca_srq.c @@ -1,690 +1,690 @@ /* * Copyright (c) 2005 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include "mthca_dev.h" #include "mthca_cmd.h" #include "mthca_memfree.h" #include "mthca_wqe.h" enum { MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE }; struct mthca_tavor_srq_context { __be64 wqe_base_ds; /* low 6 bits is descriptor size */ __be32 state_pd; __be32 lkey; __be32 uar; __be16 limit_watermark; __be16 wqe_cnt; u32 reserved[2]; }; struct mthca_arbel_srq_context { __be32 state_logsize_srqn; __be32 lkey; __be32 db_index; __be32 logstride_usrpage; __be64 wqe_base; __be32 eq_pd; __be16 limit_watermark; __be16 wqe_cnt; u16 reserved1; __be16 wqe_counter; u32 reserved2[3]; }; static void *get_wqe(struct mthca_srq *srq, int n) { if (srq->is_direct) return srq->queue.direct.buf + (n << srq->wqe_shift); else return srq->queue.page_list[(n << srq->wqe_shift) >> PAGE_SHIFT].buf + ((n << srq->wqe_shift) & (PAGE_SIZE - 1)); } /* * Return a pointer to the location within a WQE that we're using as a * link when the WQE is in the free list. We use the imm field * because in the Tavor case, posting a WQE may overwrite the next * segment of the previous WQE, but a receive WQE will never touch the * imm field. This avoids corrupting our free list if the previous * WQE has already completed and been put on the free list when we * post the next WQE. 
*/
static inline int *wqe_to_link(void *wqe)
{
	/* The link slot is the imm field of the mthca_next_seg that begins the WQE. */
	return (int *) (wqe + offsetof(struct mthca_next_seg, imm));
}

/*
 * Build the Tavor-format SRQ context for srq in *context.
 *
 * The context is zeroed first, so every field not assigned below is 0.
 */
static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
					 struct mthca_pd *pd,
					 struct mthca_srq *srq,
					 struct mthca_tavor_srq_context *context)
{
	memset(context, 0, sizeof *context);

	/* (1 << wqe_shift) bytes per WQE, expressed divided by 16. */
	context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
	context->state_pd    = cpu_to_be32(pd->pd_num);
	context->lkey        = cpu_to_be32(srq->mr.ibmr.lkey);

	/* A PD with a uobject uses the UAR of its user context, otherwise the driver UAR. */
	if (pd->ibpd.uobject)
		context->uar =
			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
	else
		context->uar = cpu_to_be32(dev->driver_uar.index);
}

/*
 * Build the Arbel-format SRQ context for srq in *context.
 *
 * The context is zeroed first, so every field not assigned below is 0.
 */
static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
					 struct mthca_pd *pd,
					 struct mthca_srq *srq,
					 struct mthca_arbel_srq_context *context)
{
	int logsize;

	memset(context, 0, sizeof *context);

	/* log2 of the queue depth goes above bit 24, the SRQ number below it. */
	logsize = ilog2(srq->max);
	context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn);
	context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
	context->db_index = cpu_to_be32(srq->db_index);
	/* (wqe_shift - 4) is shifted to bit 29; the UAR index is OR-ed in below. */
	context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
	if (pd->ibpd.uobject)
		context->logstride_usrpage |=
			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
	else
		context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index);
	context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
}

/* Release the SRQ's WQE buffer (and its MR) and the work-request ID array. */
static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq)
{
	mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue,
		       srq->is_direct, &srq->mr);
	kfree(srq->wrid);
}

/*
 * Allocate the work-request ID array and the WQE buffer for srq, then link
 * all WQEs into the free list.
 *
 * Does nothing and returns 0 when the PD has a uobject.  Returns 0 on
 * success, -ENOMEM if the wrid array cannot be allocated, or the error from
 * mthca_buf_alloc() (the wrid array is freed again in that case).
 */
static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
			       struct mthca_srq *srq)
{
	struct mthca_data_seg *scatter;
	void *wqe;
	int err;
	int i;

	if (pd->ibpd.uobject)
		return 0;

	/* One u64 work-request ID per WQE. */
	srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL);
	if (!srq->wrid)
		return -ENOMEM;

	err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift,
			      MTHCA_MAX_DIRECT_SRQ_SIZE,
			      &srq->queue, &srq->is_direct, pd, 1, &srq->mr);
	if (err) {
		kfree(srq->wrid);
		return err;
	}

	/*
	 * Now initialize the
SRQ buffer so that all of the WQEs are * linked into the list of free WQEs. In addition, set the * scatter list L_Keys to the sentry value of 0x100. */ for (i = 0; i < srq->max; ++i) { struct mthca_next_seg *next; next = wqe = get_wqe(srq, i); if (i < srq->max - 1) { *wqe_to_link(wqe) = i + 1; next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1); } else { *wqe_to_link(wqe) = -1; next->nda_op = 0; } for (scatter = wqe + sizeof (struct mthca_next_seg); (void *) scatter < wqe + (1 << srq->wqe_shift); ++scatter) scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); } srq->last = get_wqe(srq, srq->max - 1); return 0; } int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, struct ib_srq_attr *attr, struct mthca_srq *srq) { struct mthca_mailbox *mailbox; int ds; int err; /* Sanity check SRQ size before proceeding */ if (attr->max_wr > dev->limits.max_srq_wqes || attr->max_sge > dev->limits.max_srq_sge) return -EINVAL; srq->max = attr->max_wr; srq->max_gs = attr->max_sge; srq->counter = 0; if (mthca_is_memfree(dev)) srq->max = roundup_pow_of_two(srq->max + 1); else srq->max = srq->max + 1; ds = max(64UL, roundup_pow_of_two(sizeof (struct mthca_next_seg) + srq->max_gs * sizeof (struct mthca_data_seg))); if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz)) return -EINVAL; srq->wqe_shift = ilog2(ds); srq->srqn = mthca_alloc(&dev->srq_table.alloc); if (srq->srqn == -1) return -ENOMEM; if (mthca_is_memfree(dev)) { err = mthca_table_get(dev, dev->srq_table.table, srq->srqn); if (err) goto err_out; if (!pd->ibpd.uobject) { srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ, srq->srqn, &srq->db); if (srq->db_index < 0) { err = -ENOMEM; goto err_out_icm; } } } mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) { err = PTR_ERR(mailbox); goto err_out_db; } err = mthca_alloc_srq_buf(dev, pd, srq); if (err) goto err_out_mailbox; spin_lock_init(&srq->lock); srq->refcount = 1; init_waitqueue_head(&srq->wait); mutex_init(&srq->mutex); if 
(mthca_is_memfree(dev)) mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf); else mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf); err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn); if (err) { mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err); goto err_out_free_buf; } spin_lock_irq(&dev->srq_table.lock); if (mthca_array_set(&dev->srq_table.srq, srq->srqn & (dev->limits.num_srqs - 1), srq)) { spin_unlock_irq(&dev->srq_table.lock); goto err_out_free_srq; } spin_unlock_irq(&dev->srq_table.lock); mthca_free_mailbox(dev, mailbox); srq->first_free = 0; srq->last_free = srq->max - 1; attr->max_wr = srq->max - 1; attr->max_sge = srq->max_gs; return 0; err_out_free_srq: err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); if (err) mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); err_out_free_buf: if (!pd->ibpd.uobject) mthca_free_srq_buf(dev, srq); err_out_mailbox: mthca_free_mailbox(dev, mailbox); err_out_db: if (!pd->ibpd.uobject && mthca_is_memfree(dev)) mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index); err_out_icm: mthca_table_put(dev, dev->srq_table.table, srq->srqn); err_out: mthca_free(&dev->srq_table.alloc, srq->srqn); return err; } static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq) { int c; spin_lock_irq(&dev->srq_table.lock); c = srq->refcount; spin_unlock_irq(&dev->srq_table.lock); return c; } void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq) { struct mthca_mailbox *mailbox; int err; mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) { mthca_warn(dev, "No memory for mailbox to free SRQ.\n"); return; } err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); if (err) mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); spin_lock_irq(&dev->srq_table.lock); mthca_array_clear(&dev->srq_table.srq, srq->srqn & (dev->limits.num_srqs - 1)); --srq->refcount; spin_unlock_irq(&dev->srq_table.lock); wait_event(srq->wait, !get_srq_refcount(dev, srq)); if (!srq->ibsrq.uobject) { mthca_free_srq_buf(dev, srq); if 
(mthca_is_memfree(dev))
			mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
	}

	/* Release the srq_table.table entry, the SRQ number and the command mailbox. */
	mthca_table_put(dev, dev->srq_table.table, srq->srqn);
	mthca_free(&dev->srq_table.alloc, srq->srqn);
	mthca_free_mailbox(dev, mailbox);
}

/*
 * Modify SRQ attributes.
 *
 * IB_SRQ_MAX_WR is rejected with -EINVAL.  For IB_SRQ_LIMIT the new limit is
 * range-checked and then passed to mthca_ARM_SRQ() under srq->mutex.
 * Returns 0 when nothing had to be done, otherwise -EINVAL or the result of
 * mthca_ARM_SRQ().  udata is not used.
 */
int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
		     enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
{
	struct mthca_dev *dev = to_mdev(ibsrq->device);
	struct mthca_srq *srq = to_msrq(ibsrq);
	int ret = 0;

	/* We don't support resizing SRQs (yet?) */
	if (attr_mask & IB_SRQ_MAX_WR)
		return -EINVAL;

	if (attr_mask & IB_SRQ_LIMIT) {
		/* Upper bound for the limit: max - 1 on mem-free HCAs, max otherwise. */
		u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max;
		if (attr->srq_limit > max_wr)
			return -EINVAL;

		mutex_lock(&srq->mutex);
		ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit);
		mutex_unlock(&srq->mutex);
	}

	return ret;
}

/*
 * Query SRQ attributes.
 *
 * srq_limit is read from the limit_watermark field of the context returned
 * by mthca_QUERY_SRQ(); max_wr and max_sge come from the driver's own srq
 * state.  Returns 0 on success, the PTR_ERR() of a failed mailbox allocation,
 * or the error from mthca_QUERY_SRQ() (srq_attr is left untouched then).
 */
int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
{
	struct mthca_dev *dev = to_mdev(ibsrq->device);
	struct mthca_srq *srq = to_msrq(ibsrq);
	struct mthca_mailbox *mailbox;
	struct mthca_arbel_srq_context *arbel_ctx;
	struct mthca_tavor_srq_context *tavor_ctx;
	int err;

	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
	if (IS_ERR(mailbox))
		return PTR_ERR(mailbox);

	err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox);
	if (err)
		goto out;

	/* The mailbox buffer is interpreted using the context layout of the HCA type. */
	if (mthca_is_memfree(dev)) {
		arbel_ctx = mailbox->buf;
		srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark);
	} else {
		tavor_ctx = mailbox->buf;
		srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark);
	}

	srq_attr->max_wr  = srq->max - 1;
	srq_attr->max_sge = srq->max_gs;

out:
	mthca_free_mailbox(dev, mailbox);

	return err;
}

/*
 * Deliver an asynchronous event to the SRQ numbered srqn.
 *
 * The SRQ is looked up and its reference count raised under
 * dev->srq_table.lock, so it stays valid while the handler runs.  A warning
 * is logged and nothing else happens if no SRQ is stored for srqn.
 */
void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
		     enum ib_event_type event_type)
{
	struct mthca_srq *srq;
	struct ib_event event;

	spin_lock(&dev->srq_table.lock);
	srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1));
	if (srq)
		++srq->refcount;
	spin_unlock(&dev->srq_table.lock);

	if (!srq) {
		mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
		return;
	}

	if
(!srq->ibsrq.event_handler) goto out; event.device = &dev->ib_dev; event.event = event_type; event.element.srq = &srq->ibsrq; srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context); out: spin_lock(&dev->srq_table.lock); if (!--srq->refcount) wake_up(&srq->wait); spin_unlock(&dev->srq_table.lock); } /* * This function must be called with IRQs disabled. */ void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr) { int ind; struct mthca_next_seg *last_free; ind = wqe_addr >> srq->wqe_shift; spin_lock(&srq->lock); last_free = get_wqe(srq, srq->last_free); *wqe_to_link(last_free) = ind; last_free->nda_op = htonl((ind << srq->wqe_shift) | 1); *wqe_to_link(get_wqe(srq, ind)) = -1; srq->last_free = ind; spin_unlock(&srq->lock); } -int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibsrq->device); struct mthca_srq *srq = to_msrq(ibsrq); unsigned long flags; int err = 0; int first_ind; int ind; int next_ind; int nreq; int i; void *wqe; void *prev_wqe; spin_lock_irqsave(&srq->lock, flags); first_ind = srq->first_free; for (nreq = 0; wr; wr = wr->next) { ind = srq->first_free; wqe = get_wqe(srq, ind); next_ind = *wqe_to_link(wqe); if (unlikely(next_ind < 0)) { mthca_err(dev, "SRQ %06x full\n", srq->srqn); err = -ENOMEM; *bad_wr = wr; break; } prev_wqe = srq->last; srq->last = wqe; ((struct mthca_next_seg *) wqe)->ee_nds = 0; /* flags field will always remain 0 */ wqe += sizeof (struct mthca_next_seg); if (unlikely(wr->num_sge > srq->max_gs)) { err = -EINVAL; *bad_wr = wr; srq->last = prev_wqe; break; } for (i = 0; i < wr->num_sge; ++i) { mthca_set_data_seg(wqe, wr->sg_list + i); wqe += sizeof (struct mthca_data_seg); } if (i < srq->max_gs) mthca_set_data_seg_inval(wqe); ((struct mthca_next_seg *) prev_wqe)->ee_nds = cpu_to_be32(MTHCA_NEXT_DBD); srq->wrid[ind] = 
wr->wr_id; srq->first_free = next_ind; ++nreq; if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { nreq = 0; /* * Make sure that descriptors are written * before doorbell is rung. */ wmb(); mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); first_ind = srq->first_free; } } if (likely(nreq)) { /* * Make sure that descriptors are written before * doorbell is rung. */ wmb(); mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } /* * Make sure doorbells don't leak out of SRQ spinlock and * reach the HCA out of order: */ mmiowb(); spin_unlock_irqrestore(&srq->lock, flags); return err; } -int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibsrq->device); struct mthca_srq *srq = to_msrq(ibsrq); unsigned long flags; int err = 0; int ind; int next_ind; int nreq; int i; void *wqe; spin_lock_irqsave(&srq->lock, flags); for (nreq = 0; wr; ++nreq, wr = wr->next) { ind = srq->first_free; wqe = get_wqe(srq, ind); next_ind = *wqe_to_link(wqe); if (unlikely(next_ind < 0)) { mthca_err(dev, "SRQ %06x full\n", srq->srqn); err = -ENOMEM; *bad_wr = wr; break; } ((struct mthca_next_seg *) wqe)->ee_nds = 0; /* flags field will always remain 0 */ wqe += sizeof (struct mthca_next_seg); if (unlikely(wr->num_sge > srq->max_gs)) { err = -EINVAL; *bad_wr = wr; break; } for (i = 0; i < wr->num_sge; ++i) { mthca_set_data_seg(wqe, wr->sg_list + i); wqe += sizeof (struct mthca_data_seg); } if (i < srq->max_gs) mthca_set_data_seg_inval(wqe); srq->wrid[ind] = wr->wr_id; srq->first_free = next_ind; } if (likely(nreq)) { srq->counter += nreq; /* * Make sure that descriptors are written before * we write 
doorbell record. */ wmb(); *srq->db = cpu_to_be32(srq->counter); } spin_unlock_irqrestore(&srq->lock, flags); return err; } int mthca_max_srq_sge(struct mthca_dev *dev) { if (mthca_is_memfree(dev)) return dev->limits.max_sg; /* * SRQ allocations are based on powers of 2 for Tavor, * (although they only need to be multiples of 16 bytes). * * Therefore, we need to base the max number of sg entries on * the largest power of 2 descriptor size that is <= to the * actual max WQE descriptor size, rather than return the * max_sg value given by the firmware (which is based on WQE * sizes as multiples of 16, not powers of 2). * * If SRQ implementation is changed for Tavor to be based on * multiples of 16, the calculation below can be deleted and * the FW max_sg value returned. */ return min_t(int, dev->limits.max_sg, ((1 << (fls(dev->limits.max_desc_sz) - 1)) - sizeof (struct mthca_next_seg)) / sizeof (struct mthca_data_seg)); } int mthca_init_srq_table(struct mthca_dev *dev) { int err; if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) return 0; spin_lock_init(&dev->srq_table.lock); err = mthca_alloc_init(&dev->srq_table.alloc, dev->limits.num_srqs, dev->limits.num_srqs - 1, dev->limits.reserved_srqs); if (err) return err; err = mthca_array_init(&dev->srq_table.srq, dev->limits.num_srqs); if (err) mthca_alloc_cleanup(&dev->srq_table.alloc); return err; } void mthca_cleanup_srq_table(struct mthca_dev *dev) { if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) return; mthca_array_cleanup(&dev->srq_table.srq, dev->limits.num_srqs); mthca_alloc_cleanup(&dev->srq_table.alloc); } diff --git a/sys/dev/qlnx/qlnxr/qlnxr_cm.c b/sys/dev/qlnx/qlnxr/qlnxr_cm.c index 89386787c7f1..a9976146fe94 100644 --- a/sys/dev/qlnx/qlnxr/qlnxr_cm.c +++ b/sys/dev/qlnx/qlnxr/qlnxr_cm.c @@ -1,882 +1,882 @@ /* * Copyright (c) 2018-2019 Cavium, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "qlnxr_def.h" #include "rdma_common.h" #include "qlnxr_cm.h" void qlnxr_inc_sw_gsi_cons(struct qlnxr_qp_hwq_info *info) { info->gsi_cons = (info->gsi_cons + 1) % info->max_wr; } void qlnxr_store_gsi_qp_cq(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ib_qp_init_attr *attrs) { QL_DPRINT12(dev->ha, "enter\n"); dev->gsi_qp_created = 1; dev->gsi_sqcq = get_qlnxr_cq((attrs->send_cq)); dev->gsi_rqcq = get_qlnxr_cq((attrs->recv_cq)); dev->gsi_qp = qp; QL_DPRINT12(dev->ha, "exit\n"); return; } void qlnxr_ll2_complete_tx_packet(void *cxt, uint8_t connection_handle, void *cookie, dma_addr_t first_frag_addr, bool b_last_fragment, bool b_last_packet) { struct qlnxr_dev *dev = (struct qlnxr_dev *)cxt; struct ecore_roce_ll2_packet *pkt = cookie; struct qlnxr_cq *cq = dev->gsi_sqcq; struct qlnxr_qp *qp = dev->gsi_qp; unsigned long flags; QL_DPRINT12(dev->ha, "enter\n"); qlnx_dma_free_coherent(&dev->ha->cdev, pkt->header.vaddr, pkt->header.baddr, pkt->header.len); kfree(pkt); spin_lock_irqsave(&qp->q_lock, flags); qlnxr_inc_sw_gsi_cons(&qp->sq); spin_unlock_irqrestore(&qp->q_lock, flags); if (cq->ibcq.comp_handler) (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); QL_DPRINT12(dev->ha, "exit\n"); return; } void qlnxr_ll2_complete_rx_packet(void *cxt, struct ecore_ll2_comp_rx_data *data) { struct qlnxr_dev *dev = (struct qlnxr_dev *)cxt; struct qlnxr_cq *cq = dev->gsi_rqcq; // struct qlnxr_qp *qp = dev->gsi_qp; struct qlnxr_qp *qp = NULL; unsigned long flags; uint32_t qp_num = 0; // uint32_t delay_count = 0, gsi_cons = 0; //void * dest_va; QL_DPRINT12(dev->ha, "enter\n"); if (data->u.data_length_error) { /* TODO: add statistic */ } if (data->cookie == NULL) { QL_DPRINT12(dev->ha, "cookie is NULL, bad sign\n"); } qp_num = (0xFF << 16) | data->qp_id; if (data->qp_id == 1) { qp = dev->gsi_qp; } else { /* TODO: This will be needed for UD QP support */ /* For RoCEv1 this is invalid */ QL_DPRINT12(dev->ha, "invalid QP\n"); return; 
} /* note: currently only one recv sg is supported */ QL_DPRINT12(dev->ha, "MAD received on QP : %x\n", data->rx_buf_addr); spin_lock_irqsave(&qp->q_lock, flags); qp->rqe_wr_id[qp->rq.gsi_cons].rc = data->u.data_length_error ? -EINVAL : 0; qp->rqe_wr_id[qp->rq.gsi_cons].vlan_id = data->vlan; /* note: length stands for data length i.e. GRH is excluded */ qp->rqe_wr_id[qp->rq.gsi_cons].sg_list[0].length = data->length.data_length; *((u32 *)&qp->rqe_wr_id[qp->rq.gsi_cons].smac[0]) = ntohl(data->opaque_data_0); *((u16 *)&qp->rqe_wr_id[qp->rq.gsi_cons].smac[4]) = ntohs((u16)data->opaque_data_1); qlnxr_inc_sw_gsi_cons(&qp->rq); spin_unlock_irqrestore(&qp->q_lock, flags); if (cq->ibcq.comp_handler) (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); QL_DPRINT12(dev->ha, "exit\n"); return; } void qlnxr_ll2_release_rx_packet(void *cxt, u8 connection_handle, void *cookie, dma_addr_t rx_buf_addr, bool b_last_packet) { /* Do nothing... */ } static void qlnxr_destroy_gsi_cq(struct qlnxr_dev *dev, struct ib_qp_init_attr *attrs) { struct ecore_rdma_destroy_cq_in_params iparams; struct ecore_rdma_destroy_cq_out_params oparams; struct qlnxr_cq *cq; QL_DPRINT12(dev->ha, "enter\n"); cq = get_qlnxr_cq((attrs->send_cq)); iparams.icid = cq->icid; ecore_rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); ecore_chain_free(&dev->ha->cdev, &cq->pbl); cq = get_qlnxr_cq((attrs->recv_cq)); /* if a dedicated recv_cq was used, delete it too */ if (iparams.icid != cq->icid) { iparams.icid = cq->icid; ecore_rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); ecore_chain_free(&dev->ha->cdev, &cq->pbl); } QL_DPRINT12(dev->ha, "exit\n"); return; } static inline int qlnxr_check_gsi_qp_attrs(struct qlnxr_dev *dev, struct ib_qp_init_attr *attrs) { QL_DPRINT12(dev->ha, "enter\n"); if (attrs->cap.max_recv_sge > QLNXR_GSI_MAX_RECV_SGE) { QL_DPRINT11(dev->ha, "(attrs->cap.max_recv_sge > QLNXR_GSI_MAX_RECV_SGE)\n"); return -EINVAL; } if (attrs->cap.max_recv_wr > QLNXR_GSI_MAX_RECV_WR) { 
QL_DPRINT11(dev->ha, "(attrs->cap.max_recv_wr > QLNXR_GSI_MAX_RECV_WR)\n"); return -EINVAL; } if (attrs->cap.max_send_wr > QLNXR_GSI_MAX_SEND_WR) { QL_DPRINT11(dev->ha, "(attrs->cap.max_send_wr > QLNXR_GSI_MAX_SEND_WR)\n"); return -EINVAL; } QL_DPRINT12(dev->ha, "exit\n"); return 0; } static int qlnxr_ll2_post_tx(struct qlnxr_dev *dev, struct ecore_roce_ll2_packet *pkt) { enum ecore_ll2_roce_flavor_type roce_flavor; struct ecore_ll2_tx_pkt_info ll2_tx_pkt; int rc; int i; QL_DPRINT12(dev->ha, "enter\n"); memset(&ll2_tx_pkt, 0, sizeof(ll2_tx_pkt)); if (pkt->roce_mode != ROCE_V1) { QL_DPRINT11(dev->ha, "roce_mode != ROCE_V1\n"); return (-1); } roce_flavor = (pkt->roce_mode == ROCE_V1) ? ECORE_LL2_ROCE : ECORE_LL2_RROCE; ll2_tx_pkt.num_of_bds = 1 /* hdr */ + pkt->n_seg; ll2_tx_pkt.vlan = 0; /* ??? */ ll2_tx_pkt.tx_dest = ECORE_LL2_TX_DEST_NW; ll2_tx_pkt.ecore_roce_flavor = roce_flavor; ll2_tx_pkt.first_frag = pkt->header.baddr; ll2_tx_pkt.first_frag_len = pkt->header.len; ll2_tx_pkt.cookie = pkt; ll2_tx_pkt.enable_ip_cksum = 1; // Only for RoCEv2:IPv4 /* tx header */ rc = ecore_ll2_prepare_tx_packet(dev->rdma_ctx, dev->gsi_ll2_handle, &ll2_tx_pkt, 1); if (rc) { QL_DPRINT11(dev->ha, "ecore_ll2_prepare_tx_packet failed\n"); /* TX failed while posting header - release resources*/ qlnx_dma_free_coherent(&dev->ha->cdev, pkt->header.vaddr, pkt->header.baddr, pkt->header.len); kfree(pkt); return rc; } /* tx payload */ for (i = 0; i < pkt->n_seg; i++) { rc = ecore_ll2_set_fragment_of_tx_packet(dev->rdma_ctx, dev->gsi_ll2_handle, pkt->payload[i].baddr, pkt->payload[i].len); if (rc) { /* if failed not much to do here, partial packet has * been posted we can't free memory, will need to wait * for completion */ QL_DPRINT11(dev->ha, "ecore_ll2_set_fragment_of_tx_packet failed\n"); return rc; } } struct ecore_ll2_stats stats = {0}; rc = ecore_ll2_get_stats(dev->rdma_ctx, dev->gsi_ll2_handle, &stats); if (rc) { QL_DPRINT11(dev->ha, "failed to obtain ll2 stats\n"); } 
QL_DPRINT12(dev->ha, "exit\n"); return 0; } int qlnxr_ll2_stop(struct qlnxr_dev *dev) { int rc; QL_DPRINT12(dev->ha, "enter\n"); if (dev->gsi_ll2_handle == 0xFF) return 0; /* remove LL2 MAC address filter */ rc = qlnx_rdma_ll2_set_mac_filter(dev->rdma_ctx, dev->gsi_ll2_mac_address, NULL); rc = ecore_ll2_terminate_connection(dev->rdma_ctx, dev->gsi_ll2_handle); ecore_ll2_release_connection(dev->rdma_ctx, dev->gsi_ll2_handle); dev->gsi_ll2_handle = 0xFF; QL_DPRINT12(dev->ha, "exit rc = %d\n", rc); return rc; } int qlnxr_ll2_start(struct qlnxr_dev *dev, struct ib_qp_init_attr *attrs, struct qlnxr_qp *qp) { struct ecore_ll2_acquire_data data; struct ecore_ll2_cbs cbs; int rc; QL_DPRINT12(dev->ha, "enter\n"); /* configure and start LL2 */ cbs.rx_comp_cb = qlnxr_ll2_complete_rx_packet; cbs.tx_comp_cb = qlnxr_ll2_complete_tx_packet; cbs.rx_release_cb = qlnxr_ll2_release_rx_packet; cbs.tx_release_cb = qlnxr_ll2_complete_tx_packet; cbs.cookie = dev; dev->gsi_ll2_handle = 0xFF; memset(&data, 0, sizeof(data)); data.input.conn_type = ECORE_LL2_TYPE_ROCE; data.input.mtu = dev->ha->ifp->if_mtu; data.input.rx_num_desc = 8 * 1024; data.input.rx_drop_ttl0_flg = 1; data.input.rx_vlan_removal_en = 0; data.input.tx_num_desc = 8 * 1024; data.input.tx_tc = 0; data.input.tx_dest = ECORE_LL2_TX_DEST_NW; data.input.ai_err_packet_too_big = ECORE_LL2_DROP_PACKET; data.input.ai_err_no_buf = ECORE_LL2_DROP_PACKET; data.input.gsi_enable = 1; data.p_connection_handle = &dev->gsi_ll2_handle; data.cbs = &cbs; rc = ecore_ll2_acquire_connection(dev->rdma_ctx, &data); if (rc) { QL_DPRINT11(dev->ha, "ecore_ll2_acquire_connection failed: %d\n", rc); return rc; } QL_DPRINT11(dev->ha, "ll2 connection acquired successfully\n"); rc = ecore_ll2_establish_connection(dev->rdma_ctx, dev->gsi_ll2_handle); if (rc) { QL_DPRINT11(dev->ha, "ecore_ll2_establish_connection failed\n", rc); goto err1; } QL_DPRINT11(dev->ha, "ll2 connection established successfully\n"); rc = qlnx_rdma_ll2_set_mac_filter(dev->rdma_ctx, 
NULL, dev->ha->primary_mac); if (rc) { QL_DPRINT11(dev->ha, "qlnx_rdma_ll2_set_mac_filter failed\n", rc); goto err2; } QL_DPRINT12(dev->ha, "exit rc = %d\n", rc); return 0; err2: ecore_ll2_terminate_connection(dev->rdma_ctx, dev->gsi_ll2_handle); err1: ecore_ll2_release_connection(dev->rdma_ctx, dev->gsi_ll2_handle); QL_DPRINT12(dev->ha, "exit rc = %d\n", rc); return rc; } struct ib_qp* qlnxr_create_gsi_qp(struct qlnxr_dev *dev, struct ib_qp_init_attr *attrs, struct qlnxr_qp *qp) { int rc; QL_DPRINT12(dev->ha, "enter\n"); rc = qlnxr_check_gsi_qp_attrs(dev, attrs); if (rc) { QL_DPRINT11(dev->ha, "qlnxr_check_gsi_qp_attrs failed\n"); return ERR_PTR(rc); } rc = qlnxr_ll2_start(dev, attrs, qp); if (rc) { QL_DPRINT11(dev->ha, "qlnxr_ll2_start failed\n"); return ERR_PTR(rc); } /* create QP */ qp->ibqp.qp_num = 1; qp->rq.max_wr = attrs->cap.max_recv_wr; qp->sq.max_wr = attrs->cap.max_send_wr; qp->rqe_wr_id = kzalloc(qp->rq.max_wr * sizeof(*qp->rqe_wr_id), GFP_KERNEL); if (!qp->rqe_wr_id) { QL_DPRINT11(dev->ha, "(!qp->rqe_wr_id)\n"); goto err; } qp->wqe_wr_id = kzalloc(qp->sq.max_wr * sizeof(*qp->wqe_wr_id), GFP_KERNEL); if (!qp->wqe_wr_id) { QL_DPRINT11(dev->ha, "(!qp->wqe_wr_id)\n"); goto err; } qlnxr_store_gsi_qp_cq(dev, qp, attrs); memcpy(dev->gsi_ll2_mac_address, dev->ha->primary_mac, ETH_ALEN); /* the GSI CQ is handled by the driver so remove it from the FW */ qlnxr_destroy_gsi_cq(dev, attrs); dev->gsi_rqcq->cq_type = QLNXR_CQ_TYPE_GSI; dev->gsi_rqcq->cq_type = QLNXR_CQ_TYPE_GSI; QL_DPRINT12(dev->ha, "exit &qp->ibqp = %p\n", &qp->ibqp); return &qp->ibqp; err: kfree(qp->rqe_wr_id); rc = qlnxr_ll2_stop(dev); QL_DPRINT12(dev->ha, "exit with error\n"); return ERR_PTR(-ENOMEM); } int qlnxr_destroy_gsi_qp(struct qlnxr_dev *dev) { int rc = 0; QL_DPRINT12(dev->ha, "enter\n"); rc = qlnxr_ll2_stop(dev); QL_DPRINT12(dev->ha, "exit rc = %d\n", rc); return (rc); } static inline bool qlnxr_get_vlan_id_gsi(struct ib_ah_attr *ah_attr, u16 *vlan_id) { u16 tmp_vlan_id; union ib_gid 
*dgid = &ah_attr->grh.dgid; tmp_vlan_id = (dgid->raw[11] << 8) | dgid->raw[12]; if (tmp_vlan_id < 0x1000) { *vlan_id = tmp_vlan_id; return true; } else { *vlan_id = 0; return false; } } #define QLNXR_MAX_UD_HEADER_SIZE (100) #define QLNXR_GSI_QPN (1) static inline int qlnxr_gsi_build_header(struct qlnxr_dev *dev, struct qlnxr_qp *qp, - struct ib_send_wr *swr, + const struct ib_send_wr *swr, struct ib_ud_header *udh, int *roce_mode) { bool has_vlan = false, has_grh_ipv6 = true; struct ib_ah_attr *ah_attr = &get_qlnxr_ah((ud_wr(swr)->ah))->attr; struct ib_global_route *grh = &ah_attr->grh; union ib_gid sgid; int send_size = 0; u16 vlan_id = 0; u16 ether_type; #if __FreeBSD_version >= 1102000 int rc = 0; int ip_ver = 0; bool has_udp = false; #endif /* #if __FreeBSD_version >= 1102000 */ #if !DEFINE_IB_AH_ATTR_WITH_DMAC u8 mac[ETH_ALEN]; #endif int i; send_size = 0; for (i = 0; i < swr->num_sge; ++i) send_size += swr->sg_list[i].length; has_vlan = qlnxr_get_vlan_id_gsi(ah_attr, &vlan_id); ether_type = ETH_P_ROCE; *roce_mode = ROCE_V1; if (grh->sgid_index < QLNXR_MAX_SGID) sgid = dev->sgid_tbl[grh->sgid_index]; else sgid = dev->sgid_tbl[0]; #if __FreeBSD_version >= 1102000 rc = ib_ud_header_init(send_size, false /* LRH */, true /* ETH */, has_vlan, has_grh_ipv6, ip_ver, has_udp, 0 /* immediate */, udh); if (rc) { QL_DPRINT11(dev->ha, "gsi post send: failed to init header\n"); return rc; } #else ib_ud_header_init(send_size, false /* LRH */, true /* ETH */, has_vlan, has_grh_ipv6, 0 /* immediate */, udh); #endif /* #if __FreeBSD_version >= 1102000 */ /* ENET + VLAN headers*/ #if DEFINE_IB_AH_ATTR_WITH_DMAC memcpy(udh->eth.dmac_h, ah_attr->dmac, ETH_ALEN); #else qlnxr_get_dmac(dev, ah_attr, mac); memcpy(udh->eth.dmac_h, mac, ETH_ALEN); #endif memcpy(udh->eth.smac_h, dev->ha->primary_mac, ETH_ALEN); if (has_vlan) { udh->eth.type = htons(ETH_P_8021Q); udh->vlan.tag = htons(vlan_id); udh->vlan.type = htons(ether_type); } else { udh->eth.type = htons(ether_type); } for (int j 
= 0; j < 4; j++) { QL_DPRINT12(dev->ha, "destination mac: %x\n", udh->eth.dmac_h[j]); } for (int j = 0; j < 4; j++) { QL_DPRINT12(dev->ha, "source mac: %x\n", udh->eth.smac_h[j]); } QL_DPRINT12(dev->ha, "QP: %p, opcode: %d, wq: %lx, roce: %x, hops:%d," "imm : %d, vlan :%d, AH: %p\n", qp, swr->opcode, swr->wr_id, *roce_mode, grh->hop_limit, 0, has_vlan, get_qlnxr_ah((ud_wr(swr)->ah))); if (has_grh_ipv6) { /* GRH / IPv6 header */ udh->grh.traffic_class = grh->traffic_class; udh->grh.flow_label = grh->flow_label; udh->grh.hop_limit = grh->hop_limit; udh->grh.destination_gid = grh->dgid; memcpy(&udh->grh.source_gid.raw, &sgid.raw, sizeof(udh->grh.source_gid.raw)); QL_DPRINT12(dev->ha, "header: tc: %x, flow_label : %x, " "hop_limit: %x \n", udh->grh.traffic_class, udh->grh.flow_label, udh->grh.hop_limit); for (i = 0; i < 16; i++) { QL_DPRINT12(dev->ha, "udh dgid = %x\n", udh->grh.destination_gid.raw[i]); } for (i = 0; i < 16; i++) { QL_DPRINT12(dev->ha, "udh sgid = %x\n", udh->grh.source_gid.raw[i]); } udh->grh.next_header = 0x1b; } #ifdef DEFINE_IB_UD_HEADER_INIT_UDP_PRESENT /* This is for RoCEv2 */ else { /* IPv4 header */ u32 ipv4_addr; udh->ip4.protocol = IPPROTO_UDP; udh->ip4.tos = htonl(grh->flow_label); udh->ip4.frag_off = htons(IP_DF); udh->ip4.ttl = grh->hop_limit; ipv4_addr = qedr_get_ipv4_from_gid(sgid.raw); udh->ip4.saddr = ipv4_addr; ipv4_addr = qedr_get_ipv4_from_gid(grh->dgid.raw); udh->ip4.daddr = ipv4_addr; /* note: checksum is calculated by the device */ } #endif /* BTH */ udh->bth.solicited_event = !!(swr->send_flags & IB_SEND_SOLICITED); udh->bth.pkey = QLNXR_ROCE_PKEY_DEFAULT;/* TODO: ib_get_cahced_pkey?! 
*/ //udh->bth.destination_qpn = htonl(ud_wr(swr)->remote_qpn); udh->bth.destination_qpn = OSAL_CPU_TO_BE32(ud_wr(swr)->remote_qpn); //udh->bth.psn = htonl((qp->sq_psn++) & ((1 << 24) - 1)); udh->bth.psn = OSAL_CPU_TO_BE32((qp->sq_psn++) & ((1 << 24) - 1)); udh->bth.opcode = IB_OPCODE_UD_SEND_ONLY; /* DETH */ //udh->deth.qkey = htonl(0x80010000); /* qp->qkey */ /* TODO: what is?! */ //udh->deth.source_qpn = htonl(QLNXR_GSI_QPN); udh->deth.qkey = OSAL_CPU_TO_BE32(0x80010000); /* qp->qkey */ /* TODO: what is?! */ udh->deth.source_qpn = OSAL_CPU_TO_BE32(QLNXR_GSI_QPN); QL_DPRINT12(dev->ha, "exit\n"); return 0; } static inline int qlnxr_gsi_build_packet(struct qlnxr_dev *dev, - struct qlnxr_qp *qp, struct ib_send_wr *swr, + struct qlnxr_qp *qp, const struct ib_send_wr *swr, struct ecore_roce_ll2_packet **p_packet) { u8 ud_header_buffer[QLNXR_MAX_UD_HEADER_SIZE]; struct ecore_roce_ll2_packet *packet; int roce_mode, header_size; struct ib_ud_header udh; int i, rc; QL_DPRINT12(dev->ha, "enter\n"); *p_packet = NULL; rc = qlnxr_gsi_build_header(dev, qp, swr, &udh, &roce_mode); if (rc) { QL_DPRINT11(dev->ha, "qlnxr_gsi_build_header failed rc = %d\n", rc); return rc; } header_size = ib_ud_header_pack(&udh, &ud_header_buffer); packet = kzalloc(sizeof(*packet), GFP_ATOMIC); if (!packet) { QL_DPRINT11(dev->ha, "packet == NULL\n"); return -ENOMEM; } packet->header.vaddr = qlnx_dma_alloc_coherent(&dev->ha->cdev, &packet->header.baddr, header_size); if (!packet->header.vaddr) { QL_DPRINT11(dev->ha, "packet->header.vaddr == NULL\n"); kfree(packet); return -ENOMEM; } if (memcmp(udh.eth.smac_h, udh.eth.dmac_h, ETH_ALEN)) packet->tx_dest = ECORE_ROCE_LL2_TX_DEST_NW; else packet->tx_dest = ECORE_ROCE_LL2_TX_DEST_LB; packet->roce_mode = roce_mode; memcpy(packet->header.vaddr, ud_header_buffer, header_size); packet->header.len = header_size; packet->n_seg = swr->num_sge; qp->wqe_wr_id[qp->sq.prod].bytes_len = IB_GRH_BYTES; //RDMA_GRH_BYTES for (i = 0; i < packet->n_seg; i++) { 
packet->payload[i].baddr = swr->sg_list[i].addr; packet->payload[i].len = swr->sg_list[i].length; qp->wqe_wr_id[qp->sq.prod].bytes_len += packet->payload[i].len; QL_DPRINT11(dev->ha, "baddr: %p, len: %d\n", packet->payload[i].baddr, packet->payload[i].len); } *p_packet = packet; QL_DPRINT12(dev->ha, "exit, packet->n_seg: %d\n", packet->n_seg); return 0; } int qlnxr_gsi_post_send(struct ib_qp *ibqp, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct ecore_roce_ll2_packet *pkt = NULL; struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); struct qlnxr_dev *dev = qp->dev; unsigned long flags; int rc; QL_DPRINT12(dev->ha, "exit\n"); if (qp->state != ECORE_ROCE_QP_STATE_RTS) { QL_DPRINT11(dev->ha, "(qp->state != ECORE_ROCE_QP_STATE_RTS)\n"); *bad_wr = wr; return -EINVAL; } if (wr->num_sge > RDMA_MAX_SGE_PER_SQ_WQE) { QL_DPRINT11(dev->ha, "(wr->num_sge > RDMA_MAX_SGE_PER_SQ_WQE)\n"); rc = -EINVAL; goto err; } if (wr->opcode != IB_WR_SEND) { QL_DPRINT11(dev->ha, "(wr->opcode > IB_WR_SEND)\n"); rc = -EINVAL; goto err; } spin_lock_irqsave(&qp->q_lock, flags); rc = qlnxr_gsi_build_packet(dev, qp, wr, &pkt); if(rc) { spin_unlock_irqrestore(&qp->q_lock, flags); QL_DPRINT11(dev->ha, "qlnxr_gsi_build_packet failed\n"); goto err; } rc = qlnxr_ll2_post_tx(dev, pkt); if (!rc) { qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id; qp->wqe_wr_id[qp->sq.prod].signaled = !!(wr->send_flags & IB_SEND_SIGNALED); qp->wqe_wr_id[qp->sq.prod].opcode = IB_WC_SEND; qlnxr_inc_sw_prod(&qp->sq); QL_DPRINT11(dev->ha, "packet sent over gsi qp\n"); } else { QL_DPRINT11(dev->ha, "qlnxr_ll2_post_tx failed\n"); rc = -EAGAIN; *bad_wr = wr; } spin_unlock_irqrestore(&qp->q_lock, flags); if (wr->next != NULL) { *bad_wr = wr->next; rc=-EINVAL; } QL_DPRINT12(dev->ha, "exit\n"); return rc; err: *bad_wr = wr; QL_DPRINT12(dev->ha, "exit error\n"); return rc; } #define QLNXR_LL2_RX_BUFFER_SIZE (4 * 1024) int qlnxr_gsi_post_recv(struct ib_qp *ibqp, - 
struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct qlnxr_dev *dev = get_qlnxr_dev((ibqp->device)); struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); unsigned long flags; int rc = 0; QL_DPRINT12(dev->ha, "enter, wr: %p\n", wr); if ((qp->state != ECORE_ROCE_QP_STATE_RTR) && (qp->state != ECORE_ROCE_QP_STATE_RTS)) { *bad_wr = wr; QL_DPRINT11(dev->ha, "exit 0\n"); return -EINVAL; } spin_lock_irqsave(&qp->q_lock, flags); while (wr) { if (wr->num_sge > QLNXR_GSI_MAX_RECV_SGE) { QL_DPRINT11(dev->ha, "exit 1\n"); goto err; } rc = ecore_ll2_post_rx_buffer(dev->rdma_ctx, dev->gsi_ll2_handle, wr->sg_list[0].addr, wr->sg_list[0].length, 0 /* cookie */, 1 /* notify_fw */); if (rc) { QL_DPRINT11(dev->ha, "exit 2\n"); goto err; } memset(&qp->rqe_wr_id[qp->rq.prod], 0, sizeof(qp->rqe_wr_id[qp->rq.prod])); qp->rqe_wr_id[qp->rq.prod].sg_list[0] = wr->sg_list[0]; qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id; qlnxr_inc_sw_prod(&qp->rq); wr = wr->next; } spin_unlock_irqrestore(&qp->q_lock, flags); QL_DPRINT12(dev->ha, "exit rc = %d\n", rc); return rc; err: spin_unlock_irqrestore(&qp->q_lock, flags); *bad_wr = wr; QL_DPRINT12(dev->ha, "exit with -ENOMEM\n"); return -ENOMEM; } int qlnxr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct qlnxr_dev *dev = get_qlnxr_dev((ibcq->device)); struct qlnxr_cq *cq = get_qlnxr_cq(ibcq); struct qlnxr_qp *qp = dev->gsi_qp; unsigned long flags; int i = 0; QL_DPRINT12(dev->ha, "enter\n"); spin_lock_irqsave(&cq->cq_lock, flags); while (i < num_entries && qp->rq.cons != qp->rq.gsi_cons) { memset(&wc[i], 0, sizeof(*wc)); wc[i].qp = &qp->ibqp; wc[i].wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; wc[i].opcode = IB_WC_RECV; wc[i].pkey_index = 0; wc[i].status = (qp->rqe_wr_id[qp->rq.cons].rc)? 
IB_WC_GENERAL_ERR:IB_WC_SUCCESS; /* 0 - currently only one recv sg is supported */ wc[i].byte_len = qp->rqe_wr_id[qp->rq.cons].sg_list[0].length; wc[i].wc_flags |= IB_WC_GRH | IB_WC_IP_CSUM_OK; #if __FreeBSD_version >= 1100000 memcpy(&wc[i].smac, qp->rqe_wr_id[qp->rq.cons].smac, ETH_ALEN); wc[i].wc_flags |= IB_WC_WITH_SMAC; if (qp->rqe_wr_id[qp->rq.cons].vlan_id) { wc[i].wc_flags |= IB_WC_WITH_VLAN; wc[i].vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan_id; } #endif qlnxr_inc_sw_cons(&qp->rq); i++; } while (i < num_entries && qp->sq.cons != qp->sq.gsi_cons) { memset(&wc[i], 0, sizeof(*wc)); wc[i].qp = &qp->ibqp; wc[i].wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id; wc[i].opcode = IB_WC_SEND; wc[i].status = IB_WC_SUCCESS; qlnxr_inc_sw_cons(&qp->sq); i++; } spin_unlock_irqrestore(&cq->cq_lock, flags); QL_DPRINT12(dev->ha, "exit i = %d\n", i); return i; } diff --git a/sys/dev/qlnx/qlnxr/qlnxr_cm.h b/sys/dev/qlnx/qlnxr/qlnxr_cm.h index 15363613a06c..530a42833e95 100644 --- a/sys/dev/qlnx/qlnxr/qlnxr_cm.h +++ b/sys/dev/qlnx/qlnxr/qlnxr_cm.h @@ -1,109 +1,109 @@ /* * Copyright (c) 2018-2019 Cavium, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __QLNXR_CM_H__ #define __QLNXR_CM_H__ /* ECORE LL2 has a limit to the number of buffers it can handle. * FYI, OFED used 512 and 128 for recv and send. */ #define QLNXR_GSI_MAX_RECV_WR (4096) #define QLNXR_GSI_MAX_SEND_WR (4096) #define QLNXR_GSI_MAX_RECV_SGE (1) /* LL2 FW limitation */ /* future OFED/kernel will have these */ #define ETH_P_ROCE (0x8915) #define QLNXR_ROCE_V2_UDP_SPORT (0000) #if __FreeBSD_version >= 1102000 #define rdma_wr(_wr) rdma_wr(_wr) #define ud_wr(_wr) ud_wr(_wr) #define atomic_wr(_wr) atomic_wr(_wr) #else #define rdma_wr(_wr) (&(_wr->wr.rdma)) #define ud_wr(_wr) (&(_wr->wr.ud)) #define atomic_wr(_wr) (&(_wr->wr.atomic)) #endif /* #if __FreeBSD_version >= 1102000 */ static inline u32 qlnxr_get_ipv4_from_gid(u8 *gid) { return *(u32 *)(void *)&gid[12]; } struct ecore_roce_ll2_header { void *vaddr; dma_addr_t baddr; size_t len; }; struct ecore_roce_ll2_buffer { dma_addr_t baddr; size_t len; }; struct ecore_roce_ll2_packet { struct ecore_roce_ll2_header header; int n_seg; struct ecore_roce_ll2_buffer payload[RDMA_MAX_SGE_PER_SQ_WQE]; int roce_mode; enum ecore_roce_ll2_tx_dest tx_dest; }; /* RDMA CM */ extern int qlnxr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); extern int qlnxr_gsi_post_recv(struct ib_qp *ibqp, - struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); extern int qlnxr_gsi_post_send(struct ib_qp 
*ibqp, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); extern struct ib_qp* qlnxr_create_gsi_qp(struct qlnxr_dev *dev, struct ib_qp_init_attr *attrs, struct qlnxr_qp *qp); extern void qlnxr_store_gsi_qp_cq(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ib_qp_init_attr *attrs); extern void qlnxr_inc_sw_gsi_cons(struct qlnxr_qp_hwq_info *info); extern int qlnxr_destroy_gsi_qp(struct qlnxr_dev *dev); #endif /* #ifndef __QLNXR_CM_H__ */ diff --git a/sys/dev/qlnx/qlnxr/qlnxr_verbs.c b/sys/dev/qlnx/qlnxr/qlnxr_verbs.c index 861db52044f8..c078df0365f9 100644 --- a/sys/dev/qlnx/qlnxr/qlnxr_verbs.c +++ b/sys/dev/qlnx/qlnxr/qlnxr_verbs.c @@ -1,7213 +1,7213 @@ /* * Copyright (c) 2018-2019 Cavium, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: qlnxr_verbs.c */ #include __FBSDID("$FreeBSD$"); #include "qlnxr_def.h" #include "rdma_common.h" #include "qlnxr_roce.h" #include "qlnxr_cm.h" #define HILO_U64(hi, lo) ((((u64)(hi)) << 32) + (lo)) #define TYPEPTR_ADDR_SET(type_ptr, field, vaddr) \ do { \ (type_ptr)->field.hi = cpu_to_le32(upper_32_bits(vaddr));\ (type_ptr)->field.lo = cpu_to_le32(lower_32_bits(vaddr));\ } while (0) #define RQ_SGE_SET(sge, vaddr, vlength, vflags) \ do { \ TYPEPTR_ADDR_SET(sge, addr, vaddr); \ (sge)->length = cpu_to_le32(vlength); \ (sge)->flags = cpu_to_le32(vflags); \ } while (0) #define SRQ_HDR_SET(hdr, vwr_id, num_sge) \ do { \ TYPEPTR_ADDR_SET(hdr, wr_id, vwr_id); \ (hdr)->num_sges = num_sge; \ } while (0) #define SRQ_SGE_SET(sge, vaddr, vlength, vlkey) \ do { \ TYPEPTR_ADDR_SET(sge, addr, vaddr); \ (sge)->length = cpu_to_le32(vlength); \ (sge)->l_key = cpu_to_le32(vlkey); \ } while (0) #define NIPQUAD(addr) \ ((unsigned char *)&addr)[0], \ ((unsigned char *)&addr)[1], \ ((unsigned char *)&addr)[2], \ ((unsigned char *)&addr)[3] static int qlnxr_check_srq_params(struct ib_pd *ibpd, struct qlnxr_dev *dev, struct ib_srq_init_attr *attrs); static int qlnxr_init_srq_user_params(struct ib_ucontext *ib_ctx, struct qlnxr_srq *srq, struct qlnxr_create_srq_ureq *ureq, int access, int dmasync); static int qlnxr_alloc_srq_kernel_params(struct qlnxr_srq *srq, struct qlnxr_dev *dev, struct ib_srq_init_attr *init_attr); static int qlnxr_copy_srq_uresp(struct 
qlnxr_dev *dev, struct qlnxr_srq *srq, struct ib_udata *udata); static void qlnxr_free_srq_user_params(struct qlnxr_srq *srq); static void qlnxr_free_srq_kernel_params(struct qlnxr_srq *srq); static u32 qlnxr_srq_elem_left(struct qlnxr_srq_hwq_info *hw_srq); int qlnxr_iw_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *sgid) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memset(sgid->raw, 0, sizeof(sgid->raw)); memcpy(sgid->raw, dev->ha->primary_mac, sizeof (dev->ha->primary_mac)); QL_DPRINT12(ha, "exit\n"); return 0; } int qlnxr_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *sgid) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter index: %d\n", index); #if 0 int ret = 0; /* @@@: if DEFINE_ROCE_GID_TABLE to be used here */ //if (!rdma_cap_roce_gid_table(ibdev, port)) { if (!(rdma_protocol_roce(ibdev, port) && ibdev->add_gid && ibdev->del_gid)) { QL_DPRINT11(ha, "acquire gid failed\n"); return -ENODEV; } ret = ib_get_cached_gid(ibdev, port, index, sgid, NULL); if (ret == -EAGAIN) { memcpy(sgid, &zgid, sizeof(*sgid)); return 0; } #endif if ((index >= QLNXR_MAX_SGID) || (index < 0)) { QL_DPRINT12(ha, "invalid gid index %d\n", index); memset(sgid, 0, sizeof(*sgid)); return -EINVAL; } memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid)); QL_DPRINT12(ha, "exit : %p\n", sgid); return 0; } struct ib_srq * qlnxr_create_srq(struct ib_pd *ibpd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata) { struct qlnxr_dev *dev; qlnx_host_t *ha; struct ecore_rdma_destroy_srq_in_params destroy_in_params; struct ecore_rdma_create_srq_out_params out_params; struct ecore_rdma_create_srq_in_params in_params; u64 pbl_base_addr, phy_prod_pair_addr; struct qlnxr_pd *pd = get_qlnxr_pd(ibpd); struct ib_ucontext *ib_ctx = NULL; struct qlnxr_srq_hwq_info *hw_srq; struct qlnxr_ucontext *ctx = NULL; struct qlnxr_create_srq_ureq 
ureq; u32 page_cnt, page_size; struct qlnxr_srq *srq; int ret = 0; dev = get_qlnxr_dev((ibpd->device)); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); ret = qlnxr_check_srq_params(ibpd, dev, init_attr); srq = kzalloc(sizeof(*srq), GFP_KERNEL); if (!srq) { QL_DPRINT11(ha, "cannot allocate memory for srq\n"); return NULL; //@@@ : TODO what to return here? } srq->dev = dev; hw_srq = &srq->hw_srq; spin_lock_init(&srq->lock); memset(&in_params, 0, sizeof(in_params)); if (udata && ibpd->uobject && ibpd->uobject->context) { ib_ctx = ibpd->uobject->context; ctx = get_qlnxr_ucontext(ib_ctx); memset(&ureq, 0, sizeof(ureq)); if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq), udata->inlen))) { QL_DPRINT11(ha, "problem" " copying data from user space\n"); goto err0; } ret = qlnxr_init_srq_user_params(ib_ctx, srq, &ureq, 0, 0); if (ret) goto err0; page_cnt = srq->usrq.pbl_info.num_pbes; pbl_base_addr = srq->usrq.pbl_tbl->pa; phy_prod_pair_addr = hw_srq->phy_prod_pair_addr; // @@@ : if DEFINE_IB_UMEM_PAGE_SHIFT // page_size = BIT(srq->usrq.umem->page_shift); // else page_size = srq->usrq.umem->page_size; } else { struct ecore_chain *pbl; ret = qlnxr_alloc_srq_kernel_params(srq, dev, init_attr); if (ret) goto err0; pbl = &hw_srq->pbl; page_cnt = ecore_chain_get_page_cnt(pbl); pbl_base_addr = ecore_chain_get_pbl_phys(pbl); phy_prod_pair_addr = hw_srq->phy_prod_pair_addr; page_size = pbl->elem_per_page << 4; } in_params.pd_id = pd->pd_id; in_params.pbl_base_addr = pbl_base_addr; in_params.prod_pair_addr = phy_prod_pair_addr; in_params.num_pages = page_cnt; in_params.page_size = page_size; ret = ecore_rdma_create_srq(dev->rdma_ctx, &in_params, &out_params); if (ret) goto err1; srq->srq_id = out_params.srq_id; if (udata) { ret = qlnxr_copy_srq_uresp(dev, srq, udata); if (ret) goto err2; } QL_DPRINT12(ha, "created srq with srq_id = 0x%0x\n", srq->srq_id); return &srq->ibsrq; err2: memset(&in_params, 0, sizeof(in_params)); destroy_in_params.srq_id = srq->srq_id; 
ecore_rdma_destroy_srq(dev->rdma_ctx, &destroy_in_params); err1: if (udata) qlnxr_free_srq_user_params(srq); else qlnxr_free_srq_kernel_params(srq); err0: kfree(srq); return ERR_PTR(-EFAULT); } int qlnxr_destroy_srq(struct ib_srq *ibsrq) { struct qlnxr_dev *dev; struct qlnxr_srq *srq; qlnx_host_t *ha; struct ecore_rdma_destroy_srq_in_params in_params; srq = get_qlnxr_srq(ibsrq); dev = srq->dev; ha = dev->ha; memset(&in_params, 0, sizeof(in_params)); in_params.srq_id = srq->srq_id; ecore_rdma_destroy_srq(dev->rdma_ctx, &in_params); if (ibsrq->pd->uobject && ibsrq->pd->uobject->context) qlnxr_free_srq_user_params(srq); else qlnxr_free_srq_kernel_params(srq); QL_DPRINT12(ha, "destroyed srq_id=0x%0x\n", srq->srq_id); kfree(srq); return 0; } int qlnxr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) { struct qlnxr_dev *dev; struct qlnxr_srq *srq; qlnx_host_t *ha; struct ecore_rdma_modify_srq_in_params in_params; int ret = 0; srq = get_qlnxr_srq(ibsrq); dev = srq->dev; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (attr_mask & IB_SRQ_MAX_WR) { QL_DPRINT12(ha, "invalid attribute mask=0x%x" " specified for %p\n", attr_mask, srq); return -EINVAL; } if (attr_mask & IB_SRQ_LIMIT) { if (attr->srq_limit >= srq->hw_srq.max_wr) { QL_DPRINT12(ha, "invalid srq_limit=0x%x" " (max_srq_limit = 0x%x)\n", attr->srq_limit, srq->hw_srq.max_wr); return -EINVAL; } memset(&in_params, 0, sizeof(in_params)); in_params.srq_id = srq->srq_id; in_params.wqe_limit = attr->srq_limit; ret = ecore_rdma_modify_srq(dev->rdma_ctx, &in_params); if (ret) return ret; } QL_DPRINT12(ha, "modified srq with srq_id = 0x%0x\n", srq->srq_id); return 0; } int qlnxr_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) { struct qlnxr_dev *dev; struct qlnxr_srq *srq; qlnx_host_t *ha; struct ecore_rdma_device *qattr; srq = get_qlnxr_srq(ibsrq); dev = srq->dev; ha = dev->ha; //qattr = &dev->attr; qattr = ecore_rdma_query_device(dev->rdma_ctx); 
QL_DPRINT12(ha, "enter\n"); if (!dev->rdma_ctx) { QL_DPRINT12(ha, "called with invalid params" " rdma_ctx is NULL\n"); return -EINVAL; } srq_attr->srq_limit = qattr->max_srq; srq_attr->max_wr = qattr->max_srq_wr; srq_attr->max_sge = qattr->max_sge; QL_DPRINT12(ha, "exit\n"); return 0; } /* Increment srq wr producer by one */ static void qlnxr_inc_srq_wr_prod (struct qlnxr_srq_hwq_info *info) { info->wr_prod_cnt++; } /* Increment srq wr consumer by one */ static void qlnxr_inc_srq_wr_cons(struct qlnxr_srq_hwq_info *info) { info->wr_cons_cnt++; } /* get_port_immutable verb is not available in FreeBSD */ #if 0 int qlnxr_roce_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "entered but not implemented!!!\n"); } #endif int -qlnxr_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +qlnxr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct qlnxr_dev *dev; struct qlnxr_srq *srq; qlnx_host_t *ha; struct qlnxr_srq_hwq_info *hw_srq; struct ecore_chain *pbl; unsigned long flags; int status = 0; u32 num_sge, offset; srq = get_qlnxr_srq(ibsrq); dev = srq->dev; ha = dev->ha; hw_srq = &srq->hw_srq; QL_DPRINT12(ha, "enter\n"); spin_lock_irqsave(&srq->lock, flags); pbl = &srq->hw_srq.pbl; while (wr) { struct rdma_srq_wqe_header *hdr; int i; if (!qlnxr_srq_elem_left(hw_srq) || wr->num_sge > srq->hw_srq.max_sges) { QL_DPRINT11(ha, "WR cannot be posted" " (%d, %d) || (%d > %d)\n", hw_srq->wr_prod_cnt, hw_srq->wr_cons_cnt, wr->num_sge, srq->hw_srq.max_sges); status = -ENOMEM; *bad_wr = wr; break; } hdr = ecore_chain_produce(pbl); num_sge = wr->num_sge; /* Set number of sge and WR id in header */ SRQ_HDR_SET(hdr, wr->wr_id, num_sge); /* PBL is maintained in case of WR granularity. * So increment WR producer in case we post a WR. 
*/ qlnxr_inc_srq_wr_prod(hw_srq); hw_srq->wqe_prod++; hw_srq->sge_prod++; QL_DPRINT12(ha, "SRQ WR : SGEs: %d with wr_id[%d] = %llx\n", wr->num_sge, hw_srq->wqe_prod, wr->wr_id); for (i = 0; i < wr->num_sge; i++) { struct rdma_srq_sge *srq_sge = ecore_chain_produce(pbl); /* Set SGE length, lkey and address */ SRQ_SGE_SET(srq_sge, wr->sg_list[i].addr, wr->sg_list[i].length, wr->sg_list[i].lkey); QL_DPRINT12(ha, "[%d]: len %d, key %x, addr %x:%x\n", i, srq_sge->length, srq_sge->l_key, srq_sge->addr.hi, srq_sge->addr.lo); hw_srq->sge_prod++; } wmb(); /* * SRQ prod is 8 bytes. Need to update SGE prod in index * in first 4 bytes and need to update WQE prod in next * 4 bytes. */ *(srq->hw_srq.virt_prod_pair_addr) = hw_srq->sge_prod; offset = offsetof(struct rdma_srq_producers, wqe_prod); *((u8 *)srq->hw_srq.virt_prod_pair_addr + offset) = hw_srq->wqe_prod; /* Flush prod after updating it */ wmb(); wr = wr->next; } QL_DPRINT12(ha, "Elements in SRQ: %d\n", ecore_chain_get_elem_left(pbl)); spin_unlock_irqrestore(&srq->lock, flags); QL_DPRINT12(ha, "exit\n"); return status; } int #if __FreeBSD_version < 1102000 qlnxr_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) #else qlnxr_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *udata) #endif /* #if __FreeBSD_version < 1102000 */ { struct qlnxr_dev *dev; struct ecore_rdma_device *qattr; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); #if __FreeBSD_version > 1102000 if (udata->inlen || udata->outlen) return -EINVAL; #endif /* #if __FreeBSD_version > 1102000 */ if (dev->rdma_ctx == NULL) { return -EINVAL; } qattr = ecore_rdma_query_device(dev->rdma_ctx); memset(attr, 0, sizeof *attr); attr->fw_ver = qattr->fw_ver; attr->sys_image_guid = qattr->sys_image_guid; attr->max_mr_size = qattr->max_mr_size; attr->page_size_cap = qattr->page_size_caps; attr->vendor_id = qattr->vendor_id; attr->vendor_part_id = qattr->vendor_part_id; attr->hw_ver = 
qattr->hw_ver; attr->max_qp = qattr->max_qp; attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS; attr->max_sge = qattr->max_sge; attr->max_sge_rd = qattr->max_sge; attr->max_cq = qattr->max_cq; attr->max_cqe = qattr->max_cqe; attr->max_mr = qattr->max_mr; attr->max_mw = qattr->max_mw; attr->max_pd = qattr->max_pd; attr->atomic_cap = dev->atomic_cap; attr->max_fmr = qattr->max_fmr; attr->max_map_per_fmr = 16; /* TBD: FMR */ /* There is an implicit assumption in some of the ib_xxx apps that the * qp_rd_atom is smaller than the qp_init_rd_atom. Specifically, in * communication the qp_rd_atom is passed to the other side and used as * init_rd_atom without check device capabilities for init_rd_atom. * for this reason, we set the qp_rd_atom to be the minimum between the * two...There is an additional assumption in mlx4 driver that the * values are power of two, fls is performed on the value - 1, which * in fact gives a larger power of two for values which are not a power * of two. This should be fixed in mlx4 driver, but until then -> * we provide a value that is a power of two in our code. 
*/ attr->max_qp_init_rd_atom = 1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1); attr->max_qp_rd_atom = min(1 << (fls(qattr->max_qp_resp_rd_atomic_resc) - 1), attr->max_qp_init_rd_atom); attr->max_srq = qattr->max_srq; attr->max_srq_sge = qattr->max_srq_sge; attr->max_srq_wr = qattr->max_srq_wr; /* TODO: R&D to more properly configure the following */ attr->local_ca_ack_delay = qattr->dev_ack_delay; attr->max_fast_reg_page_list_len = qattr->max_mr/8; attr->max_pkeys = QLNXR_ROCE_PKEY_MAX; attr->max_ah = qattr->max_ah; QL_DPRINT12(ha, "exit\n"); return 0; } static inline void get_link_speed_and_width(int speed, uint8_t *ib_speed, uint8_t *ib_width) { switch (speed) { case 1000: *ib_speed = IB_SPEED_SDR; *ib_width = IB_WIDTH_1X; break; case 10000: *ib_speed = IB_SPEED_QDR; *ib_width = IB_WIDTH_1X; break; case 20000: *ib_speed = IB_SPEED_DDR; *ib_width = IB_WIDTH_4X; break; case 25000: *ib_speed = IB_SPEED_EDR; *ib_width = IB_WIDTH_1X; break; case 40000: *ib_speed = IB_SPEED_QDR; *ib_width = IB_WIDTH_4X; break; case 50000: *ib_speed = IB_SPEED_QDR; *ib_width = IB_WIDTH_4X; // TODO doesn't add up to 50... 
break; case 100000: *ib_speed = IB_SPEED_EDR; *ib_width = IB_WIDTH_4X; break; default: /* Unsupported */ *ib_speed = IB_SPEED_SDR; *ib_width = IB_WIDTH_1X; } return; } int qlnxr_query_port(struct ib_device *ibdev, uint8_t port, struct ib_port_attr *attr) { struct qlnxr_dev *dev; struct ecore_rdma_port *rdma_port; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (port > 1) { QL_DPRINT12(ha, "port [%d] > 1 \n", port); return -EINVAL; } if (dev->rdma_ctx == NULL) { QL_DPRINT12(ha, "rdma_ctx == NULL\n"); return -EINVAL; } rdma_port = ecore_rdma_query_port(dev->rdma_ctx); memset(attr, 0, sizeof *attr); if (rdma_port->port_state == ECORE_RDMA_PORT_UP) { attr->state = IB_PORT_ACTIVE; attr->phys_state = 5; } else { attr->state = IB_PORT_DOWN; attr->phys_state = 3; } attr->max_mtu = IB_MTU_4096; attr->active_mtu = iboe_get_mtu(dev->ha->ifp->if_mtu); attr->lid = 0; attr->lmc = 0; attr->sm_lid = 0; attr->sm_sl = 0; attr->port_cap_flags = 0; if (QLNX_IS_IWARP(dev)) { attr->gid_tbl_len = 1; attr->pkey_tbl_len = 1; } else { attr->gid_tbl_len = QLNXR_MAX_SGID; attr->pkey_tbl_len = QLNXR_ROCE_PKEY_TABLE_LEN; } attr->bad_pkey_cntr = rdma_port->pkey_bad_counter; attr->qkey_viol_cntr = 0; get_link_speed_and_width(rdma_port->link_speed, &attr->active_speed, &attr->active_width); attr->max_msg_sz = rdma_port->max_msg_size; attr->max_vl_num = 4; /* TODO -> figure this one out... 
*/ QL_DPRINT12(ha, "state = %d phys_state = %d " " link_speed = %d active_speed = %d active_width = %d" " attr->gid_tbl_len = %d attr->pkey_tbl_len = %d" " max_msg_sz = 0x%x max_vl_num = 0x%x \n", attr->state, attr->phys_state, rdma_port->link_speed, attr->active_speed, attr->active_width, attr->gid_tbl_len, attr->pkey_tbl_len, attr->max_msg_sz, attr->max_vl_num); QL_DPRINT12(ha, "exit\n"); return 0; } int qlnxr_modify_port(struct ib_device *ibdev, uint8_t port, int mask, struct ib_port_modify *props) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (port > 1) { QL_DPRINT12(ha, "port (%d) > 1\n", port); return -EINVAL; } QL_DPRINT12(ha, "exit\n"); return 0; } enum rdma_link_layer qlnxr_link_layer(struct ib_device *ibdev, uint8_t port_num) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "ibdev = %p port_num = 0x%x\n", ibdev, port_num); return IB_LINK_LAYER_ETHERNET; } struct ib_pd * qlnxr_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct qlnxr_pd *pd = NULL; u16 pd_id; int rc; struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "ibdev = %p context = %p" " udata = %p enter\n", ibdev, context, udata); if (dev->rdma_ctx == NULL) { QL_DPRINT11(ha, "dev->rdma_ctx = NULL\n"); rc = -1; goto err; } pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (!pd) { rc = -ENOMEM; QL_DPRINT11(ha, "kzalloc(pd) = NULL\n"); goto err; } rc = ecore_rdma_alloc_pd(dev->rdma_ctx, &pd_id); if (rc) { QL_DPRINT11(ha, "ecore_rdma_alloc_pd failed\n"); goto err; } pd->pd_id = pd_id; if (udata && context) { rc = ib_copy_to_udata(udata, &pd->pd_id, sizeof(pd->pd_id)); if (rc) { QL_DPRINT11(ha, "ib_copy_to_udata failed\n"); ecore_rdma_free_pd(dev->rdma_ctx, pd_id); goto err; } pd->uctx = get_qlnxr_ucontext(context); pd->uctx->pd = pd; } atomic_add_rel_32(&dev->pd_count, 1); QL_DPRINT12(ha, "exit [pd, 
pd_id, pd_count] = [%p, 0x%x, %d]\n", pd, pd_id, dev->pd_count); return &pd->ibpd; err: kfree(pd); QL_DPRINT12(ha, "exit -1\n"); return ERR_PTR(rc); } int qlnxr_dealloc_pd(struct ib_pd *ibpd) { struct qlnxr_pd *pd; struct qlnxr_dev *dev; qlnx_host_t *ha; pd = get_qlnxr_pd(ibpd); dev = get_qlnxr_dev((ibpd->device)); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (pd == NULL) { QL_DPRINT11(ha, "pd = NULL\n"); } else { ecore_rdma_free_pd(dev->rdma_ctx, pd->pd_id); kfree(pd); atomic_subtract_rel_32(&dev->pd_count, 1); QL_DPRINT12(ha, "exit [pd, pd_id, pd_count] = [%p, 0x%x, %d]\n", pd, pd->pd_id, dev->pd_count); } QL_DPRINT12(ha, "exit\n"); return 0; } #define ROCE_WQE_ELEM_SIZE sizeof(struct rdma_sq_sge) #define RDMA_MAX_SGE_PER_SRQ (4) /* Should be part of HSI */ /* Should be part of HSI */ #define RDMA_MAX_SRQ_WQE_SIZE (RDMA_MAX_SGE_PER_SRQ + 1) /* +1 for header */ #define DB_ADDR_SHIFT(addr) ((addr) << DB_PWM_ADDR_OFFSET_SHIFT) static void qlnxr_cleanup_user(struct qlnxr_dev *, struct qlnxr_qp *); static void qlnxr_cleanup_kernel(struct qlnxr_dev *, struct qlnxr_qp *); int qlnxr_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter index = 0x%x\n", index); if (index > QLNXR_ROCE_PKEY_TABLE_LEN) return -EINVAL; *pkey = QLNXR_ROCE_PKEY_DEFAULT; QL_DPRINT12(ha, "exit\n"); return 0; } static inline bool qlnxr_get_vlan_id_qp(qlnx_host_t *ha, struct ib_qp_attr *attr, int attr_mask, u16 *vlan_id) { bool ret = false; QL_DPRINT12(ha, "enter \n"); *vlan_id = 0; #if __FreeBSD_version >= 1100000 u16 tmp_vlan_id; #if __FreeBSD_version >= 1102000 union ib_gid *dgid; dgid = &attr->ah_attr.grh.dgid; tmp_vlan_id = (dgid->raw[11] << 8) | dgid->raw[12]; if (!(tmp_vlan_id & ~EVL_VLID_MASK)) { *vlan_id = tmp_vlan_id; ret = true; } #else tmp_vlan_id = attr->vlan_id; if ((attr_mask & IB_QP_VID) && (!(tmp_vlan_id & ~EVL_VLID_MASK))) { *vlan_id = tmp_vlan_id; ret = true; } 
#endif /* #if __FreeBSD_version > 1102000 */ #else ret = true; #endif /* #if __FreeBSD_version >= 1100000 */ QL_DPRINT12(ha, "exit vlan_id = 0x%x ret = %d \n", *vlan_id, ret); return (ret); } static inline void get_gid_info(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ecore_rdma_modify_qp_in_params *qp_params) { int i; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memcpy(&qp_params->sgid.bytes[0], &dev->sgid_tbl[qp->sgid_idx].raw[0], sizeof(qp_params->sgid.bytes)); memcpy(&qp_params->dgid.bytes[0], &attr->ah_attr.grh.dgid.raw[0], sizeof(qp_params->dgid)); qlnxr_get_vlan_id_qp(ha, attr, attr_mask, &qp_params->vlan_id); for (i = 0; i < (sizeof(qp_params->sgid.dwords)/sizeof(uint32_t)); i++) { qp_params->sgid.dwords[i] = ntohl(qp_params->sgid.dwords[i]); qp_params->dgid.dwords[i] = ntohl(qp_params->dgid.dwords[i]); } QL_DPRINT12(ha, "exit\n"); return; } static int qlnxr_add_mmap(struct qlnxr_ucontext *uctx, u64 phy_addr, unsigned long len) { struct qlnxr_mm *mm; qlnx_host_t *ha; ha = uctx->dev->ha; QL_DPRINT12(ha, "enter\n"); mm = kzalloc(sizeof(*mm), GFP_KERNEL); if (mm == NULL) { QL_DPRINT11(ha, "mm = NULL\n"); return -ENOMEM; } mm->key.phy_addr = phy_addr; /* This function might be called with a length which is not a multiple * of PAGE_SIZE, while the mapping is PAGE_SIZE grained and the kernel * forces this granularity by increasing the requested size if needed. * When qedr_mmap is called, it will search the list with the updated * length as a key. To prevent search failures, the length is rounded up * in advance to PAGE_SIZE. 
*/ mm->key.len = roundup(len, PAGE_SIZE); INIT_LIST_HEAD(&mm->entry); mutex_lock(&uctx->mm_list_lock); list_add(&mm->entry, &uctx->mm_head); mutex_unlock(&uctx->mm_list_lock); QL_DPRINT12(ha, "added (addr=0x%llx,len=0x%lx) for ctx=%p\n", (unsigned long long)mm->key.phy_addr, (unsigned long)mm->key.len, uctx); return 0; } static bool qlnxr_search_mmap(struct qlnxr_ucontext *uctx, u64 phy_addr, unsigned long len) { bool found = false; struct qlnxr_mm *mm; qlnx_host_t *ha; ha = uctx->dev->ha; QL_DPRINT12(ha, "enter\n"); mutex_lock(&uctx->mm_list_lock); list_for_each_entry(mm, &uctx->mm_head, entry) { if (len != mm->key.len || phy_addr != mm->key.phy_addr) continue; found = true; break; } mutex_unlock(&uctx->mm_list_lock); QL_DPRINT12(ha, "searched for (addr=0x%llx,len=0x%lx) for ctx=%p, found=%d\n", mm->key.phy_addr, mm->key.len, uctx, found); return found; } struct ib_ucontext *qlnxr_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { int rc; struct qlnxr_ucontext *ctx; struct qlnxr_alloc_ucontext_resp uresp; struct qlnxr_dev *dev = get_qlnxr_dev(ibdev); qlnx_host_t *ha = dev->ha; struct ecore_rdma_add_user_out_params oparams; if (!udata) { return ERR_PTR(-EFAULT); } ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); rc = ecore_rdma_add_user(dev->rdma_ctx, &oparams); if (rc) { QL_DPRINT12(ha, "Failed to allocate a DPI for a new RoCE application " ",rc = %d. To overcome this, consider to increase " "the number of DPIs, increase the doorbell BAR size " "or just close unnecessary RoCE applications. In " "order to increase the number of DPIs consult the " "README\n", rc); goto err; } ctx->dpi = oparams.dpi; ctx->dpi_addr = oparams.dpi_addr; ctx->dpi_phys_addr = oparams.dpi_phys_addr; ctx->dpi_size = oparams.dpi_size; INIT_LIST_HEAD(&ctx->mm_head); mutex_init(&ctx->mm_list_lock); memset(&uresp, 0, sizeof(uresp)); uresp.dpm_enabled = offsetof(struct qlnxr_alloc_ucontext_resp, dpm_enabled) < udata->outlen ? 
dev->user_dpm_enabled : 0; //TODO: figure this out uresp.wids_enabled = offsetof(struct qlnxr_alloc_ucontext_resp, wids_enabled) < udata->outlen ? 1 : 0; //TODO: figure this out uresp.wid_count = offsetof(struct qlnxr_alloc_ucontext_resp, wid_count) < udata->outlen ? oparams.wid_count : 0; //TODO: figure this out uresp.db_pa = ctx->dpi_phys_addr; uresp.db_size = ctx->dpi_size; uresp.max_send_wr = dev->attr.max_sqe; uresp.max_recv_wr = dev->attr.max_rqe; uresp.max_srq_wr = dev->attr.max_srq_wr; uresp.sges_per_send_wr = QLNXR_MAX_SQE_ELEMENTS_PER_SQE; uresp.sges_per_recv_wr = QLNXR_MAX_RQE_ELEMENTS_PER_RQE; uresp.sges_per_srq_wr = dev->attr.max_srq_sge; uresp.max_cqes = QLNXR_MAX_CQES; rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rc) goto err; ctx->dev = dev; rc = qlnxr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size); if (rc) goto err; QL_DPRINT12(ha, "Allocated user context %p\n", &ctx->ibucontext); return &ctx->ibucontext; err: kfree(ctx); return ERR_PTR(rc); } int qlnxr_dealloc_ucontext(struct ib_ucontext *ibctx) { struct qlnxr_ucontext *uctx = get_qlnxr_ucontext(ibctx); struct qlnxr_dev *dev = uctx->dev; qlnx_host_t *ha = dev->ha; struct qlnxr_mm *mm, *tmp; int status = 0; QL_DPRINT12(ha, "Deallocating user context %p\n", uctx); if (dev) { ecore_rdma_remove_user(uctx->dev->rdma_ctx, uctx->dpi); } list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { QL_DPRINT12(ha, "deleted addr= 0x%llx, len = 0x%lx for" " ctx=%p\n", mm->key.phy_addr, mm->key.len, uctx); list_del(&mm->entry); kfree(mm); } kfree(uctx); return status; } int qlnxr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct qlnxr_ucontext *ucontext = get_qlnxr_ucontext(context); struct qlnxr_dev *dev = get_qlnxr_dev((context->device)); unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; u64 unmapped_db; unsigned long len = (vma->vm_end - vma->vm_start); int rc = 0; bool found; qlnx_host_t *ha; ha = dev->ha; #if __FreeBSD_version > 1102000 unmapped_db = dev->db_phys_addr 
+ (ucontext->dpi * ucontext->dpi_size); #else unmapped_db = dev->db_phys_addr; #endif /* #if __FreeBSD_version > 1102000 */ QL_DPRINT12(ha, "qedr_mmap enter vm_page=0x%lx" " vm_pgoff=0x%lx unmapped_db=0x%llx db_size=%x, len=%lx\n", vm_page, vma->vm_pgoff, unmapped_db, dev->db_size, len); if ((vma->vm_start & (PAGE_SIZE - 1)) || (len & (PAGE_SIZE - 1))) { QL_DPRINT11(ha, "Vma_start not page aligned " "vm_start = %ld vma_end = %ld\n", vma->vm_start, vma->vm_end); return -EINVAL; } found = qlnxr_search_mmap(ucontext, vm_page, len); if (!found) { QL_DPRINT11(ha, "Vma_pgoff not found in mapped array = %ld\n", vma->vm_pgoff); return -EINVAL; } QL_DPRINT12(ha, "Mapping doorbell bar\n"); #if __FreeBSD_version > 1102000 if ((vm_page < unmapped_db) || ((vm_page + len) > (unmapped_db + ucontext->dpi_size))) { QL_DPRINT11(ha, "failed pages are outside of dpi;" "page address=0x%lx, unmapped_db=0x%lx, dpi_size=0x%x\n", vm_page, unmapped_db, ucontext->dpi_size); return -EINVAL; } if (vma->vm_flags & VM_READ) { QL_DPRINT11(ha, "failed mmap, cannot map doorbell bar for read\n"); return -EINVAL; } vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); rc = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len, vma->vm_page_prot); #else if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db + dev->db_size))) { QL_DPRINT12(ha, "Mapping doorbell bar\n"); vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); rc = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, PAGE_SIZE, vma->vm_page_prot); } else { QL_DPRINT12(ha, "Mapping chains\n"); rc = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len, vma->vm_page_prot); } #endif /* #if __FreeBSD_version > 1102000 */ QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } struct ib_mr * qlnxr_get_dma_mr(struct ib_pd *ibpd, int acc) { struct qlnxr_mr *mr; struct qlnxr_dev *dev = get_qlnxr_dev((ibpd->device)); struct qlnxr_pd *pd = get_qlnxr_pd(ibpd); int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (acc 
& IB_ACCESS_MW_BIND) { QL_DPRINT12(ha, "Unsupported access flags received for dma mr\n"); } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { rc = -ENOMEM; QL_DPRINT12(ha, "kzalloc(mr) failed %d\n", rc); goto err0; } mr->type = QLNXR_MR_DMA; rc = ecore_rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid); if (rc) { QL_DPRINT12(ha, "ecore_rdma_alloc_tid failed %d\n", rc); goto err1; } /* index only, 18 bit long, lkey = itid << 8 | key */ mr->hw_mr.tid_type = ECORE_RDMA_TID_REGISTERED_MR; mr->hw_mr.pd = pd->pd_id; mr->hw_mr.local_read = 1; mr->hw_mr.local_write = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; mr->hw_mr.remote_read = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; mr->hw_mr.remote_write = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; mr->hw_mr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; mr->hw_mr.dma_mr = true; rc = ecore_rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); if (rc) { QL_DPRINT12(ha, "ecore_rdma_register_tid failed %d\n", rc); goto err2; } mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; if (mr->hw_mr.remote_write || mr->hw_mr.remote_read || mr->hw_mr.remote_atomic) { mr->ibmr.rkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; } QL_DPRINT12(ha, "lkey = %x\n", mr->ibmr.lkey); return &mr->ibmr; err2: ecore_rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); err1: kfree(mr); err0: QL_DPRINT12(ha, "exit [%d]\n", rc); return ERR_PTR(rc); } static void qlnxr_free_pbl(struct qlnxr_dev *dev, struct qlnxr_pbl_info *pbl_info, struct qlnxr_pbl *pbl) { int i; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); for (i = 0; i < pbl_info->num_pbls; i++) { if (!pbl[i].va) continue; qlnx_dma_free_coherent(&dev->ha->cdev, pbl[i].va, pbl[i].pa, pbl_info->pbl_size); } kfree(pbl); QL_DPRINT12(ha, "exit\n"); return; } #define MIN_FW_PBL_PAGE_SIZE (4*1024) #define MAX_FW_PBL_PAGE_SIZE (64*1024) #define NUM_PBES_ON_PAGE(_page_size) (_page_size / sizeof(u64)) #define MAX_PBES_ON_PAGE NUM_PBES_ON_PAGE(MAX_FW_PBL_PAGE_SIZE) #define MAX_PBES_TWO_LAYER (MAX_PBES_ON_PAGE*MAX_PBES_ON_PAGE) 
/*
 * Allocate the descriptor array and the DMA-coherent pages for a PBL
 * made of pbl_info->num_pbls pages of pbl_info->pbl_size bytes each.
 * Every page is zeroed.  The first page is then filled with the
 * physical addresses of all the following pages, which is what a
 * two-layer PBL needs (with a single page nothing is written).
 *
 * Returns the descriptor array, or NULL when pbl_info describes no
 * pages or an allocation fails; pages obtained so far are released.
 */
static struct qlnxr_pbl *
qlnxr_alloc_pbl_tbl(struct qlnxr_dev *dev,
	struct qlnxr_pbl_info *pbl_info, gfp_t flags)
{
	void			*va;
	dma_addr_t		pa;
	dma_addr_t		*pbl_main_tbl;
	struct qlnxr_pbl	*pbl_table;
	int			i;
	qlnx_host_t		*ha;

	ha = dev->ha;

	QL_DPRINT12(ha, "enter\n");

	/* With no pages, pbl_table[0] below would not exist. */
	if (pbl_info->num_pbls == 0) {
		QL_DPRINT11(ha, "num_pbls = 0\n");
		return NULL;
	}

	/* kcalloc() zeroes like kzalloc() and checks count * size. */
	pbl_table = kcalloc(pbl_info->num_pbls, sizeof(*pbl_table), flags);

	if (!pbl_table) {
		QL_DPRINT12(ha, "pbl_table = NULL\n");
		return NULL;
	}

	for (i = 0; i < pbl_info->num_pbls; i++) {
		va = qlnx_dma_alloc_coherent(&dev->ha->cdev, &pa,
			pbl_info->pbl_size);
		if (!va) {
			QL_DPRINT11(ha, "Failed to allocate pbl#%d\n", i);
			goto err;
		}
		memset(va, 0, pbl_info->pbl_size);
		pbl_table[i].va = va;
		pbl_table[i].pa = pa;
	}

	/* Two-Layer PBLs, if we have more than one pbl we need to initialize
	 * the first one with physical pointers to all of the rest
	 */
	pbl_main_tbl = (dma_addr_t *)pbl_table[0].va;
	for (i = 0; i < pbl_info->num_pbls - 1; i++)
		pbl_main_tbl[i] = pbl_table[i + 1].pa;

	QL_DPRINT12(ha, "exit\n");
	return pbl_table;

err:
	/* Entries never reached still have va == NULL and are skipped. */
	qlnxr_free_pbl(dev, pbl_info, pbl_table);

	QL_DPRINT12(ha, "exit with error\n");
	return NULL;
}

static int
qlnxr_prepare_pbl_tbl(struct qlnxr_dev *dev,
	struct qlnxr_pbl_info *pbl_info,
	u32 num_pbes,
	int two_layer_capable)
{
	u32 pbl_capacity;
	u32 pbl_size;
	u32 num_pbls;
	qlnx_host_t *ha;

	ha = dev->ha;

	QL_DPRINT12(ha, "enter\n");

	if ((num_pbes > MAX_PBES_ON_PAGE) && two_layer_capable) {
		if (num_pbes > MAX_PBES_TWO_LAYER) {
			QL_DPRINT11(ha, "prepare pbl table: too many pages %d\n",
				num_pbes);
			return -EINVAL;
		}

		/* calculate required pbl page size */
		pbl_size = MIN_FW_PBL_PAGE_SIZE;
		pbl_capacity = NUM_PBES_ON_PAGE(pbl_size) *
			NUM_PBES_ON_PAGE(pbl_size);

		while (pbl_capacity < num_pbes) {
			pbl_size *= 2;
			pbl_capacity = pbl_size / sizeof(u64);
			pbl_capacity = pbl_capacity * pbl_capacity;
		}

		num_pbls = DIV_ROUND_UP(num_pbes, NUM_PBES_ON_PAGE(pbl_size));
		num_pbls++; /* One for the layer0 ( points to the pbls) */
		pbl_info->two_layered = true;
	} else {
		/* One layered PBL */
		num_pbls = 1;
		pbl_size
= max_t(u32, MIN_FW_PBL_PAGE_SIZE, \ roundup_pow_of_two((num_pbes * sizeof(u64)))); pbl_info->two_layered = false; } pbl_info->num_pbls = num_pbls; pbl_info->pbl_size = pbl_size; pbl_info->num_pbes = num_pbes; QL_DPRINT12(ha, "prepare pbl table: num_pbes=%d, num_pbls=%d pbl_size=%d\n", pbl_info->num_pbes, pbl_info->num_pbls, pbl_info->pbl_size); return 0; } static void qlnxr_populate_pbls(struct qlnxr_dev *dev, struct ib_umem *umem, struct qlnxr_pbl *pbl, struct qlnxr_pbl_info *pbl_info) { struct regpair *pbe; struct qlnxr_pbl *pbl_tbl; struct scatterlist *sg; int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0; qlnx_host_t *ha; #ifdef DEFINE_IB_UMEM_WITH_CHUNK int i; struct ib_umem_chunk *chunk = NULL; #else int entry; #endif ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (!pbl_info) { QL_DPRINT11(ha, "PBL_INFO not initialized\n"); return; } if (!pbl_info->num_pbes) { QL_DPRINT11(ha, "pbl_info->num_pbes == 0\n"); return; } /* If we have a two layered pbl, the first pbl points to the rest * of the pbls and the first entry lays on the second pbl in the table */ if (pbl_info->two_layered) pbl_tbl = &pbl[1]; else pbl_tbl = pbl; pbe = (struct regpair *)pbl_tbl->va; if (!pbe) { QL_DPRINT12(ha, "pbe is NULL\n"); return; } pbe_cnt = 0; shift = ilog2(umem->page_size); #ifndef DEFINE_IB_UMEM_WITH_CHUNK for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { #else list_for_each_entry(chunk, &umem->chunk_list, list) { /* get all the dma regions from the chunk. 
*/ for (i = 0; i < chunk->nmap; i++) { sg = &chunk->page_list[i]; #endif pages = sg_dma_len(sg) >> shift; for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { /* store the page address in pbe */ pbe->lo = cpu_to_le32(sg_dma_address(sg) + (umem->page_size * pg_cnt)); pbe->hi = cpu_to_le32(upper_32_bits ((sg_dma_address(sg) + umem->page_size * pg_cnt))); QL_DPRINT12(ha, "Populate pbl table:" " pbe->addr=0x%x:0x%x " " pbe_cnt = %d total_num_pbes=%d" " pbe=%p\n", pbe->lo, pbe->hi, pbe_cnt, total_num_pbes, pbe); pbe_cnt ++; total_num_pbes ++; pbe++; if (total_num_pbes == pbl_info->num_pbes) return; /* if the given pbl is full storing the pbes, * move to next pbl. */ if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) { pbl_tbl++; pbe = (struct regpair *)pbl_tbl->va; pbe_cnt = 0; } } #ifdef DEFINE_IB_UMEM_WITH_CHUNK } #endif } QL_DPRINT12(ha, "exit\n"); return; } static void free_mr_info(struct qlnxr_dev *dev, struct mr_info *info) { struct qlnxr_pbl *pbl, *tmp; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (info->pbl_table) list_add_tail(&info->pbl_table->list_entry, &info->free_pbl_list); if (!list_empty(&info->inuse_pbl_list)) list_splice(&info->inuse_pbl_list, &info->free_pbl_list); list_for_each_entry_safe(pbl, tmp, &info->free_pbl_list, list_entry) { list_del(&pbl->list_entry); qlnxr_free_pbl(dev, &info->pbl_info, pbl); } QL_DPRINT12(ha, "exit\n"); return; } static int qlnxr_init_mr_info(struct qlnxr_dev *dev, struct mr_info *info, size_t page_list_len, bool two_layered) { int rc; struct qlnxr_pbl *tmp; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); INIT_LIST_HEAD(&info->free_pbl_list); INIT_LIST_HEAD(&info->inuse_pbl_list); rc = qlnxr_prepare_pbl_tbl(dev, &info->pbl_info, page_list_len, two_layered); if (rc) { QL_DPRINT11(ha, "qlnxr_prepare_pbl_tbl [%d]\n", rc); goto done; } info->pbl_table = qlnxr_alloc_pbl_tbl(dev, &info->pbl_info, GFP_KERNEL); if (!info->pbl_table) { rc = -ENOMEM; QL_DPRINT11(ha, "qlnxr_alloc_pbl_tbl returned NULL\n"); goto 
done; } QL_DPRINT12(ha, "pbl_table_pa = %pa\n", &info->pbl_table->pa); /* in usual case we use 2 PBLs, so we add one to free * list and allocating another one */ tmp = qlnxr_alloc_pbl_tbl(dev, &info->pbl_info, GFP_KERNEL); if (!tmp) { QL_DPRINT11(ha, "Extra PBL is not allocated\n"); goto done; /* it's OK if second allocation fails, so rc = 0*/ } list_add_tail(&tmp->list_entry, &info->free_pbl_list); QL_DPRINT12(ha, "extra pbl_table_pa = %pa\n", &tmp->pa); done: if (rc) free_mr_info(dev, info); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } struct ib_mr * #if __FreeBSD_version >= 1102000 qlnxr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, u64 usr_addr, int acc, struct ib_udata *udata) #else qlnxr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, u64 usr_addr, int acc, struct ib_udata *udata, int mr_id) #endif /* #if __FreeBSD_version >= 1102000 */ { int rc = -ENOMEM; struct qlnxr_dev *dev = get_qlnxr_dev((ibpd->device)); struct qlnxr_mr *mr; struct qlnxr_pd *pd; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); pd = get_qlnxr_pd(ibpd); QL_DPRINT12(ha, "qedr_register user mr pd = %d" " start = %lld, len = %lld, usr_addr = %lld, acc = %d\n", pd->pd_id, start, len, usr_addr, acc); if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) { QL_DPRINT11(ha, "(acc & IB_ACCESS_REMOTE_WRITE &&" " !(acc & IB_ACCESS_LOCAL_WRITE))\n"); return ERR_PTR(-EINVAL); } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { QL_DPRINT11(ha, "kzalloc(mr) failed\n"); return ERR_PTR(rc); } mr->type = QLNXR_MR_USER; mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); if (IS_ERR(mr->umem)) { rc = -EFAULT; QL_DPRINT11(ha, "ib_umem_get failed [%p]\n", mr->umem); goto err0; } rc = qlnxr_init_mr_info(dev, &mr->info, ib_umem_page_count(mr->umem), 1); if (rc) { QL_DPRINT11(ha, "qlnxr_init_mr_info failed [%d]\n", rc); goto err1; } qlnxr_populate_pbls(dev, mr->umem, mr->info.pbl_table, &mr->info.pbl_info); rc = ecore_rdma_alloc_tid(dev->rdma_ctx, 
&mr->hw_mr.itid); if (rc) { QL_DPRINT11(ha, "roce alloc tid returned an error %d\n", rc); goto err1; } /* index only, 18 bit long, lkey = itid << 8 | key */ mr->hw_mr.tid_type = ECORE_RDMA_TID_REGISTERED_MR; mr->hw_mr.key = 0; mr->hw_mr.pd = pd->pd_id; mr->hw_mr.local_read = 1; mr->hw_mr.local_write = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; mr->hw_mr.remote_read = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; mr->hw_mr.remote_write = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; mr->hw_mr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; mr->hw_mr.mw_bind = false; /* TBD MW BIND */ mr->hw_mr.pbl_ptr = mr->info.pbl_table[0].pa; mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered; mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size); mr->hw_mr.page_size_log = ilog2(mr->umem->page_size); /* for the MR pages */ #if __FreeBSD_version >= 1102000 mr->hw_mr.fbo = ib_umem_offset(mr->umem); #else mr->hw_mr.fbo = mr->umem->offset; #endif mr->hw_mr.length = len; mr->hw_mr.vaddr = usr_addr; mr->hw_mr.zbva = false; /* TBD figure when this should be true */ mr->hw_mr.phy_mr = false; /* Fast MR - True, Regular Register False */ mr->hw_mr.dma_mr = false; rc = ecore_rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); if (rc) { QL_DPRINT11(ha, "roce register tid returned an error %d\n", rc); goto err2; } mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; if (mr->hw_mr.remote_write || mr->hw_mr.remote_read || mr->hw_mr.remote_atomic) mr->ibmr.rkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; QL_DPRINT12(ha, "register user mr lkey: %x\n", mr->ibmr.lkey); return (&mr->ibmr); err2: ecore_rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); err1: qlnxr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); err0: kfree(mr); QL_DPRINT12(ha, "exit [%d]\n", rc); return (ERR_PTR(rc)); } int qlnxr_dereg_mr(struct ib_mr *ib_mr) { struct qlnxr_mr *mr = get_qlnxr_mr(ib_mr); struct qlnxr_dev *dev = get_qlnxr_dev((ib_mr->device)); int rc = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if 
((mr->type != QLNXR_MR_DMA) && (mr->type != QLNXR_MR_FRMR)) qlnxr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); /* it could be user registered memory. */ if (mr->umem) ib_umem_release(mr->umem); kfree(mr->pages); kfree(mr); QL_DPRINT12(ha, "exit\n"); return rc; } static int qlnxr_copy_cq_uresp(struct qlnxr_dev *dev, struct qlnxr_cq *cq, struct ib_udata *udata) { struct qlnxr_create_cq_uresp uresp; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memset(&uresp, 0, sizeof(uresp)); uresp.db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT); uresp.icid = cq->icid; rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rc) { QL_DPRINT12(ha, "ib_copy_to_udata error cqid=0x%x[%d]\n", cq->icid, rc); } QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } static void consume_cqe(struct qlnxr_cq *cq) { if (cq->latest_cqe == cq->toggle_cqe) cq->pbl_toggle ^= RDMA_RESIZE_CQ_RAMROD_DATA_TOGGLE_BIT_MASK; cq->latest_cqe = ecore_chain_consume(&cq->pbl); } static inline int qlnxr_align_cq_entries(int entries) { u64 size, aligned_size; /* We allocate an extra entry that we don't report to the FW. * Why? * The CQE size is 32 bytes but the FW writes in chunks of 64 bytes * (for performance purposes). Allocating an extra entry and telling * the FW we have less prevents overwriting the first entry in case of * a wrap i.e. when the FW writes the last entry and the application * hasn't read the first one. */ size = (entries + 1) * QLNXR_CQE_SIZE; /* We align to PAGE_SIZE. * Why? * Since the CQ is going to be mapped and the mapping is anyhow in whole * kernel pages we benefit from the possibly extra CQEs. 
*/ aligned_size = ALIGN(size, PAGE_SIZE); /* note: for CQs created in user space the result of this function * should match the size mapped in user space */ return (aligned_size / QLNXR_CQE_SIZE); } static inline int qlnxr_init_user_queue(struct ib_ucontext *ib_ctx, struct qlnxr_dev *dev, struct qlnxr_userq *q, u64 buf_addr, size_t buf_len, int access, int dmasync, int alloc_and_init) { int page_cnt; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); q->buf_addr = buf_addr; q->buf_len = buf_len; QL_DPRINT12(ha, "buf_addr : %llx, buf_len : %x, access : %x" " dmasync : %x\n", q->buf_addr, q->buf_len, access, dmasync); q->umem = ib_umem_get(ib_ctx, q->buf_addr, q->buf_len, access, dmasync); if (IS_ERR(q->umem)) { QL_DPRINT11(ha, "ib_umem_get failed [%lx]\n", PTR_ERR(q->umem)); return PTR_ERR(q->umem); } page_cnt = ib_umem_page_count(q->umem); rc = qlnxr_prepare_pbl_tbl(dev, &q->pbl_info, page_cnt, 0 /* SQ and RQ don't support dual layer pbl. * CQ may, but this is yet uncoded. 
*/); if (rc) { QL_DPRINT11(ha, "qlnxr_prepare_pbl_tbl failed [%d]\n", rc); goto err; } if (alloc_and_init) { q->pbl_tbl = qlnxr_alloc_pbl_tbl(dev, &q->pbl_info, GFP_KERNEL); if (!q->pbl_tbl) { QL_DPRINT11(ha, "qlnxr_alloc_pbl_tbl failed\n"); rc = -ENOMEM; goto err; } qlnxr_populate_pbls(dev, q->umem, q->pbl_tbl, &q->pbl_info); } else { q->pbl_tbl = kzalloc(sizeof(*q->pbl_tbl), GFP_KERNEL); if (!q->pbl_tbl) { QL_DPRINT11(ha, "qlnxr_alloc_pbl_tbl failed\n"); rc = -ENOMEM; goto err; } } QL_DPRINT12(ha, "exit\n"); return 0; err: ib_umem_release(q->umem); q->umem = NULL; QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } #if __FreeBSD_version >= 1102000 struct ib_cq * qlnxr_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *ib_ctx, struct ib_udata *udata) #else #if __FreeBSD_version >= 1100000 struct ib_cq * qlnxr_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *ib_ctx, struct ib_udata *udata) #else struct ib_cq * qlnxr_create_cq(struct ib_device *ibdev, int entries, int vector, struct ib_ucontext *ib_ctx, struct ib_udata *udata) #endif /* #if __FreeBSD_version >= 1100000 */ #endif /* #if __FreeBSD_version >= 1102000 */ { struct qlnxr_ucontext *ctx; struct ecore_rdma_destroy_cq_out_params destroy_oparams; struct ecore_rdma_destroy_cq_in_params destroy_iparams; struct qlnxr_dev *dev; struct ecore_rdma_create_cq_in_params params; struct qlnxr_create_cq_ureq ureq; #if __FreeBSD_version >= 1100000 int vector = attr->comp_vector; int entries = attr->cqe; #endif struct qlnxr_cq *cq; int chain_entries, rc, page_cnt; u64 pbl_ptr; u16 icid; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "called from %s. entries = %d, " "vector = %d\n", (udata ? 
"User Lib" : "Kernel"), entries, vector); memset(¶ms, 0, sizeof(struct ecore_rdma_create_cq_in_params)); memset(&destroy_iparams, 0, sizeof(struct ecore_rdma_destroy_cq_in_params)); memset(&destroy_oparams, 0, sizeof(struct ecore_rdma_destroy_cq_out_params)); if (entries > QLNXR_MAX_CQES) { QL_DPRINT11(ha, "the number of entries %d is too high. " "Must be equal or below %d.\n", entries, QLNXR_MAX_CQES); return ERR_PTR(-EINVAL); } chain_entries = qlnxr_align_cq_entries(entries); chain_entries = min_t(int, chain_entries, QLNXR_MAX_CQES); cq = qlnx_zalloc((sizeof(struct qlnxr_cq))); if (!cq) return ERR_PTR(-ENOMEM); if (udata) { memset(&ureq, 0, sizeof(ureq)); if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq), udata->inlen))) { QL_DPRINT11(ha, "ib_copy_from_udata failed\n"); goto err0; } if (!ureq.len) { QL_DPRINT11(ha, "ureq.len == 0\n"); goto err0; } cq->cq_type = QLNXR_CQ_TYPE_USER; qlnxr_init_user_queue(ib_ctx, dev, &cq->q, ureq.addr, ureq.len, IB_ACCESS_LOCAL_WRITE, 1, 1); pbl_ptr = cq->q.pbl_tbl->pa; page_cnt = cq->q.pbl_info.num_pbes; cq->ibcq.cqe = chain_entries; } else { cq->cq_type = QLNXR_CQ_TYPE_KERNEL; rc = ecore_chain_alloc(&dev->ha->cdev, ECORE_CHAIN_USE_TO_CONSUME, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U32, chain_entries, sizeof(union roce_cqe), &cq->pbl, NULL); if (rc) goto err1; page_cnt = ecore_chain_get_page_cnt(&cq->pbl); pbl_ptr = ecore_chain_get_pbl_phys(&cq->pbl); cq->ibcq.cqe = cq->pbl.capacity; } params.cq_handle_hi = upper_32_bits((uintptr_t)cq); params.cq_handle_lo = lower_32_bits((uintptr_t)cq); params.cnq_id = vector; params.cq_size = chain_entries - 1; params.pbl_num_pages = page_cnt; params.pbl_ptr = pbl_ptr; params.pbl_two_level = 0; if (ib_ctx != NULL) { ctx = get_qlnxr_ucontext(ib_ctx); params.dpi = ctx->dpi; } else { params.dpi = dev->dpi; } rc = ecore_rdma_create_cq(dev->rdma_ctx, ¶ms, &icid); if (rc) goto err2; cq->icid = icid; cq->sig = QLNXR_CQ_MAGIC_NUMBER; spin_lock_init(&cq->cq_lock); if (ib_ctx) { rc = 
qlnxr_copy_cq_uresp(dev, cq, udata); if (rc) goto err3; } else { /* Generate doorbell address. * Configure bits 3-9 with DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT. * TODO: consider moving to device scope as it is a function of * the device. * TODO: add ifdef if plan to support 16 bit. */ cq->db_addr = dev->db_addr + DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT); cq->db.data.icid = cq->icid; cq->db.data.params = DB_AGG_CMD_SET << RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT; /* point to the very last element, passing it we will toggle */ cq->toggle_cqe = ecore_chain_get_last_elem(&cq->pbl); cq->pbl_toggle = RDMA_RESIZE_CQ_RAMROD_DATA_TOGGLE_BIT_MASK; /* must be different from pbl_toggle */ cq->latest_cqe = NULL; consume_cqe(cq); cq->cq_cons = ecore_chain_get_cons_idx_u32(&cq->pbl); } QL_DPRINT12(ha, "exit icid = 0x%0x, addr = %p," " number of entries = 0x%x\n", cq->icid, cq, params.cq_size); QL_DPRINT12(ha,"cq_addr = %p\n", cq); return &cq->ibcq; err3: destroy_iparams.icid = cq->icid; ecore_rdma_destroy_cq(dev->rdma_ctx, &destroy_iparams, &destroy_oparams); err2: if (udata) qlnxr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl); else ecore_chain_free(&dev->ha->cdev, &cq->pbl); err1: if (udata) ib_umem_release(cq->q.umem); err0: kfree(cq); QL_DPRINT12(ha, "exit error\n"); return ERR_PTR(-EINVAL); } int qlnxr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata) { int status = 0; struct qlnxr_dev *dev = get_qlnxr_dev((ibcq->device)); qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter/exit\n"); return status; } int qlnxr_destroy_cq(struct ib_cq *ibcq) { struct qlnxr_dev *dev = get_qlnxr_dev((ibcq->device)); struct ecore_rdma_destroy_cq_out_params oparams; struct ecore_rdma_destroy_cq_in_params iparams; struct qlnxr_cq *cq = get_qlnxr_cq(ibcq); int rc = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter cq_id = %d\n", cq->icid); cq->destroyed = 1; /* TODO: Syncronize irq of the CNQ the CQ belongs to for validation * that all completions with notification are 
dealt with. The rest * of the completions are not interesting */ /* GSIs CQs are handled by driver, so they don't exist in the FW */ if (cq->cq_type != QLNXR_CQ_TYPE_GSI) { iparams.icid = cq->icid; rc = ecore_rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); if (rc) { QL_DPRINT12(ha, "ecore_rdma_destroy_cq failed cq_id = %d\n", cq->icid); return rc; } QL_DPRINT12(ha, "free cq->pbl cq_id = %d\n", cq->icid); ecore_chain_free(&dev->ha->cdev, &cq->pbl); } if (ibcq->uobject && ibcq->uobject->context) { qlnxr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl); ib_umem_release(cq->q.umem); } cq->sig = ~cq->sig; kfree(cq); QL_DPRINT12(ha, "exit cq_id = %d\n", cq->icid); return rc; } static int qlnxr_check_qp_attrs(struct ib_pd *ibpd, struct qlnxr_dev *dev, struct ib_qp_init_attr *attrs, struct ib_udata *udata) { struct ecore_rdma_device *qattr; qlnx_host_t *ha; qattr = ecore_rdma_query_device(dev->rdma_ctx); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); QL_DPRINT12(ha, "attrs->sq_sig_type = %d\n", attrs->sq_sig_type); QL_DPRINT12(ha, "attrs->qp_type = %d\n", attrs->qp_type); QL_DPRINT12(ha, "attrs->create_flags = %d\n", attrs->create_flags); #if __FreeBSD_version < 1102000 QL_DPRINT12(ha, "attrs->qpg_type = %d\n", attrs->qpg_type); #endif QL_DPRINT12(ha, "attrs->port_num = %d\n", attrs->port_num); QL_DPRINT12(ha, "attrs->cap.max_send_wr = 0x%x\n", attrs->cap.max_send_wr); QL_DPRINT12(ha, "attrs->cap.max_recv_wr = 0x%x\n", attrs->cap.max_recv_wr); QL_DPRINT12(ha, "attrs->cap.max_send_sge = 0x%x\n", attrs->cap.max_send_sge); QL_DPRINT12(ha, "attrs->cap.max_recv_sge = 0x%x\n", attrs->cap.max_recv_sge); QL_DPRINT12(ha, "attrs->cap.max_inline_data = 0x%x\n", attrs->cap.max_inline_data); #if __FreeBSD_version < 1102000 QL_DPRINT12(ha, "attrs->cap.qpg_tss_mask_sz = 0x%x\n", attrs->cap.qpg_tss_mask_sz); #endif QL_DPRINT12(ha, "\n\nqattr->vendor_id = 0x%x\n", qattr->vendor_id); QL_DPRINT12(ha, "qattr->vendor_part_id = 0x%x\n", qattr->vendor_part_id); QL_DPRINT12(ha, "qattr->hw_ver = 
0x%x\n", qattr->hw_ver); QL_DPRINT12(ha, "qattr->fw_ver = %p\n", (void *)qattr->fw_ver); QL_DPRINT12(ha, "qattr->node_guid = %p\n", (void *)qattr->node_guid); QL_DPRINT12(ha, "qattr->sys_image_guid = %p\n", (void *)qattr->sys_image_guid); QL_DPRINT12(ha, "qattr->max_cnq = 0x%x\n", qattr->max_cnq); QL_DPRINT12(ha, "qattr->max_sge = 0x%x\n", qattr->max_sge); QL_DPRINT12(ha, "qattr->max_srq_sge = 0x%x\n", qattr->max_srq_sge); QL_DPRINT12(ha, "qattr->max_inline = 0x%x\n", qattr->max_inline); QL_DPRINT12(ha, "qattr->max_wqe = 0x%x\n", qattr->max_wqe); QL_DPRINT12(ha, "qattr->max_srq_wqe = 0x%x\n", qattr->max_srq_wqe); QL_DPRINT12(ha, "qattr->max_qp_resp_rd_atomic_resc = 0x%x\n", qattr->max_qp_resp_rd_atomic_resc); QL_DPRINT12(ha, "qattr->max_qp_req_rd_atomic_resc = 0x%x\n", qattr->max_qp_req_rd_atomic_resc); QL_DPRINT12(ha, "qattr->max_dev_resp_rd_atomic_resc = 0x%x\n", qattr->max_dev_resp_rd_atomic_resc); QL_DPRINT12(ha, "qattr->max_cq = 0x%x\n", qattr->max_cq); QL_DPRINT12(ha, "qattr->max_qp = 0x%x\n", qattr->max_qp); QL_DPRINT12(ha, "qattr->max_srq = 0x%x\n", qattr->max_srq); QL_DPRINT12(ha, "qattr->max_mr = 0x%x\n", qattr->max_mr); QL_DPRINT12(ha, "qattr->max_mr_size = %p\n", (void *)qattr->max_mr_size); QL_DPRINT12(ha, "qattr->max_cqe = 0x%x\n", qattr->max_cqe); QL_DPRINT12(ha, "qattr->max_mw = 0x%x\n", qattr->max_mw); QL_DPRINT12(ha, "qattr->max_fmr = 0x%x\n", qattr->max_fmr); QL_DPRINT12(ha, "qattr->max_mr_mw_fmr_pbl = 0x%x\n", qattr->max_mr_mw_fmr_pbl); QL_DPRINT12(ha, "qattr->max_mr_mw_fmr_size = %p\n", (void *)qattr->max_mr_mw_fmr_size); QL_DPRINT12(ha, "qattr->max_pd = 0x%x\n", qattr->max_pd); QL_DPRINT12(ha, "qattr->max_ah = 0x%x\n", qattr->max_ah); QL_DPRINT12(ha, "qattr->max_pkey = 0x%x\n", qattr->max_pkey); QL_DPRINT12(ha, "qattr->max_srq_wr = 0x%x\n", qattr->max_srq_wr); QL_DPRINT12(ha, "qattr->max_stats_queues = 0x%x\n", qattr->max_stats_queues); //QL_DPRINT12(ha, "qattr->dev_caps = 0x%x\n", qattr->dev_caps); QL_DPRINT12(ha, "qattr->page_size_caps = 
%p\n", (void *)qattr->page_size_caps); QL_DPRINT12(ha, "qattr->dev_ack_delay = 0x%x\n", qattr->dev_ack_delay); QL_DPRINT12(ha, "qattr->reserved_lkey = 0x%x\n", qattr->reserved_lkey); QL_DPRINT12(ha, "qattr->bad_pkey_counter = 0x%x\n", qattr->bad_pkey_counter); if ((attrs->qp_type == IB_QPT_GSI) && udata) { QL_DPRINT12(ha, "unexpected udata when creating GSI QP\n"); return -EINVAL; } if (udata && !(ibpd->uobject && ibpd->uobject->context)) { QL_DPRINT12(ha, "called from user without context\n"); return -EINVAL; } /* QP0... attrs->qp_type == IB_QPT_GSI */ if (attrs->qp_type != IB_QPT_RC && attrs->qp_type != IB_QPT_GSI) { QL_DPRINT12(ha, "unsupported qp type=0x%x requested\n", attrs->qp_type); return -EINVAL; } if (attrs->qp_type == IB_QPT_GSI && attrs->srq) { QL_DPRINT12(ha, "cannot create GSI qp with SRQ\n"); return -EINVAL; } /* Skip the check for QP1 to support CM size of 128 */ if (attrs->cap.max_send_wr > qattr->max_wqe) { QL_DPRINT12(ha, "cannot create a SQ with %d elements " " (max_send_wr=0x%x)\n", attrs->cap.max_send_wr, qattr->max_wqe); return -EINVAL; } if (!attrs->srq && (attrs->cap.max_recv_wr > qattr->max_wqe)) { QL_DPRINT12(ha, "cannot create a RQ with %d elements" " (max_recv_wr=0x%x)\n", attrs->cap.max_recv_wr, qattr->max_wqe); return -EINVAL; } if (attrs->cap.max_inline_data > qattr->max_inline) { QL_DPRINT12(ha, "unsupported inline data size=0x%x " "requested (max_inline=0x%x)\n", attrs->cap.max_inline_data, qattr->max_inline); return -EINVAL; } if (attrs->cap.max_send_sge > qattr->max_sge) { QL_DPRINT12(ha, "unsupported send_sge=0x%x " "requested (max_send_sge=0x%x)\n", attrs->cap.max_send_sge, qattr->max_sge); return -EINVAL; } if (attrs->cap.max_recv_sge > qattr->max_sge) { QL_DPRINT12(ha, "unsupported recv_sge=0x%x requested " " (max_recv_sge=0x%x)\n", attrs->cap.max_recv_sge, qattr->max_sge); return -EINVAL; } /* unprivileged user space cannot create special QP */ if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) { QL_DPRINT12(ha, "userspace 
can't create special QPs of type=0x%x\n", attrs->qp_type); return -EINVAL; } /* allow creating only one GSI type of QP */ if (attrs->qp_type == IB_QPT_GSI && dev->gsi_qp_created) { QL_DPRINT12(ha, "create qp: GSI special QPs already created.\n"); return -EINVAL; } /* verify consumer QPs are not trying to use GSI QP's CQ */ if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) { struct qlnxr_cq *send_cq = get_qlnxr_cq(attrs->send_cq); struct qlnxr_cq *recv_cq = get_qlnxr_cq(attrs->recv_cq); if ((send_cq->cq_type == QLNXR_CQ_TYPE_GSI) || (recv_cq->cq_type == QLNXR_CQ_TYPE_GSI)) { QL_DPRINT11(ha, "consumer QP cannot use GSI CQs.\n"); return -EINVAL; } } QL_DPRINT12(ha, "exit\n"); return 0; } static int qlnxr_copy_srq_uresp(struct qlnxr_dev *dev, struct qlnxr_srq *srq, struct ib_udata *udata) { struct qlnxr_create_srq_uresp uresp; qlnx_host_t *ha; int rc; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memset(&uresp, 0, sizeof(uresp)); uresp.srq_id = srq->srq_id; rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } static void qlnxr_copy_rq_uresp(struct qlnxr_dev *dev, struct qlnxr_create_qp_uresp *uresp, struct qlnxr_qp *qp) { qlnx_host_t *ha; ha = dev->ha; /* Return if QP is associated with SRQ instead of RQ */ QL_DPRINT12(ha, "enter qp->srq = %p\n", qp->srq); if (qp->srq) return; /* iWARP requires two doorbells per RQ. 
*/ if (QLNX_IS_IWARP(dev)) { uresp->rq_db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD); uresp->rq_db2_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_FLAGS); QL_DPRINT12(ha, "uresp->rq_db_offset = 0x%x " "uresp->rq_db2_offset = 0x%x\n", uresp->rq_db_offset, uresp->rq_db2_offset); } else { uresp->rq_db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); } uresp->rq_icid = qp->icid; QL_DPRINT12(ha, "exit\n"); return; } static void qlnxr_copy_sq_uresp(struct qlnxr_dev *dev, struct qlnxr_create_qp_uresp *uresp, struct qlnxr_qp *qp) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); uresp->sq_db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); /* iWARP uses the same cid for rq and sq*/ if (QLNX_IS_IWARP(dev)) { uresp->sq_icid = qp->icid; QL_DPRINT12(ha, "uresp->sq_icid = 0x%x\n", uresp->sq_icid); } else uresp->sq_icid = qp->icid + 1; QL_DPRINT12(ha, "exit\n"); return; } static int qlnxr_copy_qp_uresp(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ib_udata *udata) { int rc; struct qlnxr_create_qp_uresp uresp; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter qp->icid =0x%x\n", qp->icid); memset(&uresp, 0, sizeof(uresp)); qlnxr_copy_sq_uresp(dev, &uresp, qp); qlnxr_copy_rq_uresp(dev, &uresp, qp); uresp.atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE; uresp.qp_id = qp->qp_id; rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } static void qlnxr_set_common_qp_params(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_pd *pd, struct ib_qp_init_attr *attrs) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); spin_lock_init(&qp->q_lock); atomic_set(&qp->refcnt, 1); qp->pd = pd; qp->sig = QLNXR_QP_MAGIC_NUMBER; qp->qp_type = attrs->qp_type; qp->max_inline_data = ROCE_REQ_MAX_INLINE_DATA_SIZE; qp->sq.max_sges = attrs->cap.max_send_sge; qp->state = ECORE_ROCE_QP_STATE_RESET; qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? 
true : false; qp->sq_cq = get_qlnxr_cq(attrs->send_cq); qp->rq_cq = get_qlnxr_cq(attrs->recv_cq); qp->dev = dev; if (!attrs->srq) { /* QP is associated with RQ instead of SRQ */ qp->rq.max_sges = attrs->cap.max_recv_sge; QL_DPRINT12(ha, "RQ params:\trq_max_sges = %d, rq_cq_id = %d\n", qp->rq.max_sges, qp->rq_cq->icid); } else { qp->srq = get_qlnxr_srq(attrs->srq); } QL_DPRINT12(ha, "QP params:\tpd = %d, qp_type = %d, max_inline_data = %d," " state = %d, signaled = %d, use_srq=%d\n", pd->pd_id, qp->qp_type, qp->max_inline_data, qp->state, qp->signaled, ((attrs->srq) ? 1 : 0)); QL_DPRINT12(ha, "SQ params:\tsq_max_sges = %d, sq_cq_id = %d\n", qp->sq.max_sges, qp->sq_cq->icid); return; } static int qlnxr_check_srq_params(struct ib_pd *ibpd, struct qlnxr_dev *dev, struct ib_srq_init_attr *attrs) { struct ecore_rdma_device *qattr; qlnx_host_t *ha; ha = dev->ha; qattr = ecore_rdma_query_device(dev->rdma_ctx); QL_DPRINT12(ha, "enter\n"); if (attrs->attr.max_wr > qattr->max_srq_wqe) { QL_DPRINT12(ha, "unsupported srq_wr=0x%x" " requested (max_srq_wr=0x%x)\n", attrs->attr.max_wr, qattr->max_srq_wr); return -EINVAL; } if (attrs->attr.max_sge > qattr->max_sge) { QL_DPRINT12(ha, "unsupported sge=0x%x requested (max_srq_sge=0x%x)\n", attrs->attr.max_sge, qattr->max_sge); return -EINVAL; } if (attrs->attr.srq_limit > attrs->attr.max_wr) { QL_DPRINT12(ha, "unsupported srq_limit=0x%x requested" " (max_srq_limit=0x%x)\n", attrs->attr.srq_limit, attrs->attr.srq_limit); return -EINVAL; } QL_DPRINT12(ha, "exit\n"); return 0; } static void qlnxr_free_srq_user_params(struct qlnxr_srq *srq) { struct qlnxr_dev *dev = srq->dev; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); qlnxr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); ib_umem_release(srq->usrq.umem); ib_umem_release(srq->prod_umem); QL_DPRINT12(ha, "exit\n"); return; } static void qlnxr_free_srq_kernel_params(struct qlnxr_srq *srq) { struct qlnxr_srq_hwq_info *hw_srq = &srq->hw_srq; struct qlnxr_dev *dev = 
srq->dev; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); ecore_chain_free(dev->cdev, &hw_srq->pbl); qlnx_dma_free_coherent(&dev->cdev, hw_srq->virt_prod_pair_addr, hw_srq->phy_prod_pair_addr, sizeof(struct rdma_srq_producers)); QL_DPRINT12(ha, "exit\n"); return; } static int qlnxr_init_srq_user_params(struct ib_ucontext *ib_ctx, struct qlnxr_srq *srq, struct qlnxr_create_srq_ureq *ureq, int access, int dmasync) { #ifdef DEFINE_IB_UMEM_WITH_CHUNK struct ib_umem_chunk *chunk; #endif struct scatterlist *sg; int rc; struct qlnxr_dev *dev = srq->dev; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); rc = qlnxr_init_user_queue(ib_ctx, srq->dev, &srq->usrq, ureq->srq_addr, ureq->srq_len, access, dmasync, 1); if (rc) return rc; srq->prod_umem = ib_umem_get(ib_ctx, ureq->prod_pair_addr, sizeof(struct rdma_srq_producers), access, dmasync); if (IS_ERR(srq->prod_umem)) { qlnxr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); ib_umem_release(srq->usrq.umem); QL_DPRINT12(ha, "ib_umem_get failed for producer [%p]\n", PTR_ERR(srq->prod_umem)); return PTR_ERR(srq->prod_umem); } #ifdef DEFINE_IB_UMEM_WITH_CHUNK chunk = container_of((&srq->prod_umem->chunk_list)->next, typeof(*chunk), list); sg = &chunk->page_list[0]; #else sg = srq->prod_umem->sg_head.sgl; #endif srq->hw_srq.phy_prod_pair_addr = sg_dma_address(sg); QL_DPRINT12(ha, "exit\n"); return 0; } static int qlnxr_alloc_srq_kernel_params(struct qlnxr_srq *srq, struct qlnxr_dev *dev, struct ib_srq_init_attr *init_attr) { struct qlnxr_srq_hwq_info *hw_srq = &srq->hw_srq; dma_addr_t phy_prod_pair_addr; u32 num_elems, max_wr; void *va; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); va = qlnx_dma_alloc_coherent(&dev->cdev, &phy_prod_pair_addr, sizeof(struct rdma_srq_producers)); if (!va) { QL_DPRINT11(ha, "qlnx_dma_alloc_coherent failed for produceer\n"); return -ENOMEM; } hw_srq->phy_prod_pair_addr = phy_prod_pair_addr; hw_srq->virt_prod_pair_addr = va; max_wr = 
init_attr->attr.max_wr; num_elems = max_wr * RDMA_MAX_SRQ_WQE_SIZE; rc = ecore_chain_alloc(dev->cdev, ECORE_CHAIN_USE_TO_CONSUME_PRODUCE, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U32, num_elems, ECORE_RDMA_SRQ_WQE_ELEM_SIZE, &hw_srq->pbl, NULL); if (rc) { QL_DPRINT11(ha, "ecore_chain_alloc failed [%d]\n", rc); goto err0; } hw_srq->max_wr = max_wr; hw_srq->num_elems = num_elems; hw_srq->max_sges = RDMA_MAX_SGE_PER_SRQ; QL_DPRINT12(ha, "exit\n"); return 0; err0: qlnx_dma_free_coherent(&dev->cdev, va, phy_prod_pair_addr, sizeof(struct rdma_srq_producers)); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } static inline void qlnxr_init_common_qp_in_params(struct qlnxr_dev *dev, struct qlnxr_pd *pd, struct qlnxr_qp *qp, struct ib_qp_init_attr *attrs, bool fmr_and_reserved_lkey, struct ecore_rdma_create_qp_in_params *params) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); /* QP handle to be written in an async event */ params->qp_handle_async_lo = lower_32_bits((uintptr_t)qp); params->qp_handle_async_hi = upper_32_bits((uintptr_t)qp); params->signal_all = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR); params->fmr_and_reserved_lkey = fmr_and_reserved_lkey; params->pd = pd->pd_id; params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi; params->sq_cq_id = get_qlnxr_cq(attrs->send_cq)->icid; params->stats_queue = 0; params->rq_cq_id = get_qlnxr_cq(attrs->recv_cq)->icid; if (qp->srq) { /* QP is associated with SRQ instead of RQ */ params->srq_id = qp->srq->srq_id; params->use_srq = true; QL_DPRINT11(ha, "exit srq_id = 0x%x use_srq = 0x%x\n", params->srq_id, params->use_srq); return; } params->srq_id = 0; params->use_srq = false; QL_DPRINT12(ha, "exit\n"); return; } static inline void qlnxr_qp_user_print( struct qlnxr_dev *dev, struct qlnxr_qp *qp) { QL_DPRINT12((dev->ha), "qp=%p. 
sq_addr=0x%llx, sq_len=%zd, " "rq_addr=0x%llx, rq_len=%zd\n", qp, qp->usq.buf_addr, qp->usq.buf_len, qp->urq.buf_addr, qp->urq.buf_len); return; } static int qlnxr_idr_add(struct qlnxr_dev *dev, void *ptr, u32 id) { u32 newid; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (!QLNX_IS_IWARP(dev)) return 0; do { if (!idr_pre_get(&dev->qpidr, GFP_KERNEL)) { QL_DPRINT11(ha, "idr_pre_get failed\n"); return -ENOMEM; } mtx_lock(&dev->idr_lock); rc = idr_get_new_above(&dev->qpidr, ptr, id, &newid); mtx_unlock(&dev->idr_lock); } while (rc == -EAGAIN); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } static void qlnxr_idr_remove(struct qlnxr_dev *dev, u32 id) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (!QLNX_IS_IWARP(dev)) return; mtx_lock(&dev->idr_lock); idr_remove(&dev->qpidr, id); mtx_unlock(&dev->idr_lock); QL_DPRINT12(ha, "exit \n"); return; } static inline void qlnxr_iwarp_populate_user_qp(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ecore_rdma_create_qp_out_params *out_params) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); qp->usq.pbl_tbl->va = out_params->sq_pbl_virt; qp->usq.pbl_tbl->pa = out_params->sq_pbl_phys; qlnxr_populate_pbls(dev, qp->usq.umem, qp->usq.pbl_tbl, &qp->usq.pbl_info); if (qp->srq) { QL_DPRINT11(ha, "qp->srq = %p\n", qp->srq); return; } qp->urq.pbl_tbl->va = out_params->rq_pbl_virt; qp->urq.pbl_tbl->pa = out_params->rq_pbl_phys; qlnxr_populate_pbls(dev, qp->urq.umem, qp->urq.pbl_tbl, &qp->urq.pbl_info); QL_DPRINT12(ha, "exit\n"); return; } static int qlnxr_create_user_qp(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ib_pd *ibpd, struct ib_udata *udata, struct ib_qp_init_attr *attrs) { struct ecore_rdma_destroy_qp_out_params d_out_params; struct ecore_rdma_create_qp_in_params in_params; struct ecore_rdma_create_qp_out_params out_params; struct qlnxr_pd *pd = get_qlnxr_pd(ibpd); struct ib_ucontext *ib_ctx = NULL; struct qlnxr_ucontext *ctx = NULL; struct qlnxr_create_qp_ureq ureq; 
int alloc_and_init = QLNX_IS_ROCE(dev); int rc = -EINVAL; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); ib_ctx = ibpd->uobject->context; ctx = get_qlnxr_ucontext(ib_ctx); memset(&ureq, 0, sizeof(ureq)); rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); if (rc) { QL_DPRINT11(ha, "ib_copy_from_udata failed [%d]\n", rc); return rc; } /* SQ - read access only (0), dma sync not required (0) */ rc = qlnxr_init_user_queue(ib_ctx, dev, &qp->usq, ureq.sq_addr, ureq.sq_len, 0, 0, alloc_and_init); if (rc) { QL_DPRINT11(ha, "qlnxr_init_user_queue failed [%d]\n", rc); return rc; } if (!qp->srq) { /* RQ - read access only (0), dma sync not required (0) */ rc = qlnxr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr, ureq.rq_len, 0, 0, alloc_and_init); if (rc) { QL_DPRINT11(ha, "qlnxr_init_user_queue failed [%d]\n", rc); return rc; } } memset(&in_params, 0, sizeof(in_params)); qlnxr_init_common_qp_in_params(dev, pd, qp, attrs, false, &in_params); in_params.qp_handle_lo = ureq.qp_handle_lo; in_params.qp_handle_hi = ureq.qp_handle_hi; in_params.sq_num_pages = qp->usq.pbl_info.num_pbes; in_params.sq_pbl_ptr = qp->usq.pbl_tbl->pa; if (!qp->srq) { in_params.rq_num_pages = qp->urq.pbl_info.num_pbes; in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa; } qp->ecore_qp = ecore_rdma_create_qp(dev->rdma_ctx, &in_params, &out_params); if (!qp->ecore_qp) { rc = -ENOMEM; QL_DPRINT11(ha, "ecore_rdma_create_qp failed\n"); goto err1; } if (QLNX_IS_IWARP(dev)) qlnxr_iwarp_populate_user_qp(dev, qp, &out_params); qp->qp_id = out_params.qp_id; qp->icid = out_params.icid; rc = qlnxr_copy_qp_uresp(dev, qp, udata); if (rc) { QL_DPRINT11(ha, "qlnxr_copy_qp_uresp failed\n"); goto err; } qlnxr_qp_user_print(dev, qp); QL_DPRINT12(ha, "exit\n"); return 0; err: rc = ecore_rdma_destroy_qp(dev->rdma_ctx, qp->ecore_qp, &d_out_params); if (rc) QL_DPRINT12(ha, "fatal fault\n"); err1: qlnxr_cleanup_user(dev, qp); QL_DPRINT12(ha, "exit[%d]\n", rc); return rc; } static void qlnxr_set_roce_db_info(struct 
qlnxr_dev *dev, struct qlnxr_qp *qp) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter qp = %p qp->srq %p\n", qp, qp->srq); qp->sq.db = dev->db_addr + DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); qp->sq.db_data.data.icid = qp->icid + 1; if (!qp->srq) { qp->rq.db = dev->db_addr + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); qp->rq.db_data.data.icid = qp->icid; } QL_DPRINT12(ha, "exit\n"); return; } static void qlnxr_set_iwarp_db_info(struct qlnxr_dev *dev, struct qlnxr_qp *qp) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter qp = %p qp->srq %p\n", qp, qp->srq); qp->sq.db = dev->db_addr + DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); qp->sq.db_data.data.icid = qp->icid; if (!qp->srq) { qp->rq.db = dev->db_addr + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD); qp->rq.db_data.data.icid = qp->icid; qp->rq.iwarp_db2 = dev->db_addr + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_FLAGS); qp->rq.iwarp_db2_data.data.icid = qp->icid; qp->rq.iwarp_db2_data.data.value = DQ_TCM_IWARP_POST_RQ_CF_CMD; } QL_DPRINT12(ha, "qp->sq.db = %p qp->sq.db_data.data.icid =0x%x\n" "\t\t\tqp->rq.db = %p qp->rq.db_data.data.icid =0x%x\n" "\t\t\tqp->rq.iwarp_db2 = %p qp->rq.iwarp_db2.data.icid =0x%x" " qp->rq.iwarp_db2.data.prod_val =0x%x\n", qp->sq.db, qp->sq.db_data.data.icid, qp->rq.db, qp->rq.db_data.data.icid, qp->rq.iwarp_db2, qp->rq.iwarp_db2_data.data.icid, qp->rq.iwarp_db2_data.data.value); QL_DPRINT12(ha, "exit\n"); return; } static int qlnxr_roce_create_kernel_qp(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ecore_rdma_create_qp_in_params *in_params, u32 n_sq_elems, u32 n_rq_elems) { struct ecore_rdma_create_qp_out_params out_params; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); rc = ecore_chain_alloc( dev->cdev, ECORE_CHAIN_USE_TO_PRODUCE, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U32, n_sq_elems, QLNXR_SQE_ELEMENT_SIZE, &qp->sq.pbl, NULL); if (rc) { QL_DPRINT11(ha, "ecore_chain_alloc qp->sq.pbl failed[%d]\n", rc); return rc; } in_params->sq_num_pages = 
ecore_chain_get_page_cnt(&qp->sq.pbl); in_params->sq_pbl_ptr = ecore_chain_get_pbl_phys(&qp->sq.pbl); if (!qp->srq) { rc = ecore_chain_alloc( dev->cdev, ECORE_CHAIN_USE_TO_CONSUME_PRODUCE, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U32, n_rq_elems, QLNXR_RQE_ELEMENT_SIZE, &qp->rq.pbl, NULL); if (rc) { QL_DPRINT11(ha, "ecore_chain_alloc qp->rq.pbl failed[%d]\n", rc); return rc; } in_params->rq_num_pages = ecore_chain_get_page_cnt(&qp->rq.pbl); in_params->rq_pbl_ptr = ecore_chain_get_pbl_phys(&qp->rq.pbl); } qp->ecore_qp = ecore_rdma_create_qp(dev->rdma_ctx, in_params, &out_params); if (!qp->ecore_qp) { QL_DPRINT11(ha, "qp->ecore_qp == NULL\n"); return -EINVAL; } qp->qp_id = out_params.qp_id; qp->icid = out_params.icid; qlnxr_set_roce_db_info(dev, qp); QL_DPRINT12(ha, "exit\n"); return 0; } static int qlnxr_iwarp_create_kernel_qp(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ecore_rdma_create_qp_in_params *in_params, u32 n_sq_elems, u32 n_rq_elems) { struct ecore_rdma_destroy_qp_out_params d_out_params; struct ecore_rdma_create_qp_out_params out_params; struct ecore_chain_ext_pbl ext_pbl; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); in_params->sq_num_pages = ECORE_CHAIN_PAGE_CNT(n_sq_elems, QLNXR_SQE_ELEMENT_SIZE, ECORE_CHAIN_MODE_PBL); in_params->rq_num_pages = ECORE_CHAIN_PAGE_CNT(n_rq_elems, QLNXR_RQE_ELEMENT_SIZE, ECORE_CHAIN_MODE_PBL); QL_DPRINT12(ha, "n_sq_elems = 0x%x" " n_rq_elems = 0x%x in_params\n" "\t\t\tqp_handle_lo\t\t= 0x%08x\n" "\t\t\tqp_handle_hi\t\t= 0x%08x\n" "\t\t\tqp_handle_async_lo\t\t= 0x%08x\n" "\t\t\tqp_handle_async_hi\t\t= 0x%08x\n" "\t\t\tuse_srq\t\t\t= 0x%x\n" "\t\t\tsignal_all\t\t= 0x%x\n" "\t\t\tfmr_and_reserved_lkey\t= 0x%x\n" "\t\t\tpd\t\t\t= 0x%x\n" "\t\t\tdpi\t\t\t= 0x%x\n" "\t\t\tsq_cq_id\t\t\t= 0x%x\n" "\t\t\tsq_num_pages\t\t= 0x%x\n" "\t\t\tsq_pbl_ptr\t\t= %p\n" "\t\t\tmax_sq_sges\t\t= 0x%x\n" "\t\t\trq_cq_id\t\t\t= 0x%x\n" "\t\t\trq_num_pages\t\t= 0x%x\n" "\t\t\trq_pbl_ptr\t\t= %p\n" "\t\t\tsrq_id\t\t\t= 
0x%x\n" "\t\t\tstats_queue\t\t= 0x%x\n", n_sq_elems, n_rq_elems, in_params->qp_handle_lo, in_params->qp_handle_hi, in_params->qp_handle_async_lo, in_params->qp_handle_async_hi, in_params->use_srq, in_params->signal_all, in_params->fmr_and_reserved_lkey, in_params->pd, in_params->dpi, in_params->sq_cq_id, in_params->sq_num_pages, (void *)in_params->sq_pbl_ptr, in_params->max_sq_sges, in_params->rq_cq_id, in_params->rq_num_pages, (void *)in_params->rq_pbl_ptr, in_params->srq_id, in_params->stats_queue ); memset(&out_params, 0, sizeof (struct ecore_rdma_create_qp_out_params)); memset(&ext_pbl, 0, sizeof (struct ecore_chain_ext_pbl)); qp->ecore_qp = ecore_rdma_create_qp(dev->rdma_ctx, in_params, &out_params); if (!qp->ecore_qp) { QL_DPRINT11(ha, "ecore_rdma_create_qp failed\n"); return -EINVAL; } /* Now we allocate the chain */ ext_pbl.p_pbl_virt = out_params.sq_pbl_virt; ext_pbl.p_pbl_phys = out_params.sq_pbl_phys; QL_DPRINT12(ha, "ext_pbl.p_pbl_virt = %p " "ext_pbl.p_pbl_phys = %p\n", ext_pbl.p_pbl_virt, ext_pbl.p_pbl_phys); rc = ecore_chain_alloc( dev->cdev, ECORE_CHAIN_USE_TO_PRODUCE, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U32, n_sq_elems, QLNXR_SQE_ELEMENT_SIZE, &qp->sq.pbl, &ext_pbl); if (rc) { QL_DPRINT11(ha, "ecore_chain_alloc qp->sq.pbl failed rc = %d\n", rc); goto err; } ext_pbl.p_pbl_virt = out_params.rq_pbl_virt; ext_pbl.p_pbl_phys = out_params.rq_pbl_phys; QL_DPRINT12(ha, "ext_pbl.p_pbl_virt = %p " "ext_pbl.p_pbl_phys = %p\n", ext_pbl.p_pbl_virt, ext_pbl.p_pbl_phys); if (!qp->srq) { rc = ecore_chain_alloc( dev->cdev, ECORE_CHAIN_USE_TO_CONSUME_PRODUCE, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U32, n_rq_elems, QLNXR_RQE_ELEMENT_SIZE, &qp->rq.pbl, &ext_pbl); if (rc) { QL_DPRINT11(ha,, "ecore_chain_alloc qp->rq.pbl" " failed rc = %d\n", rc); goto err; } } QL_DPRINT12(ha, "qp_id = 0x%x icid =0x%x\n", out_params.qp_id, out_params.icid); qp->qp_id = out_params.qp_id; qp->icid = out_params.icid; qlnxr_set_iwarp_db_info(dev, qp); QL_DPRINT12(ha, "exit\n"); 
return 0; /* Error unwind: a chain allocation failed after the QP was created, so destroy the QP; rc carries the allocation error. */ err: ecore_rdma_destroy_qp(dev->rdma_ctx, qp->ecore_qp, &d_out_params); QL_DPRINT12(ha, "exit rc = %d\n", rc); return rc; } /* Create a kernel QP: allocate the SQ/RQ shadow arrays (wqe_wr_id / rqe_wr_id), fill in_params from attrs and the PD, then delegate to the RoCE or iWARP helper. Returns 0 or a negative error. */ static int qlnxr_create_kernel_qp(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct ib_pd *ibpd, struct ib_qp_init_attr *attrs) { struct ecore_rdma_create_qp_in_params in_params; struct qlnxr_pd *pd = get_qlnxr_pd(ibpd); int rc = -EINVAL; u32 n_rq_elems; u32 n_sq_elems; u32 n_sq_entries; struct ecore_rdma_device *qattr = ecore_rdma_query_device(dev->rdma_ctx); qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memset(&in_params, 0, sizeof(in_params)); /* A single work request may take up to MAX_SQ_WQE_SIZE elements in * the ring. The ring should allow at least a single WR, even if the * user requested none, due to allocation issues. * We should add an extra WR since the prod and cons indices of * wqe_wr_id are managed in such a way that the WQ is considered full * when (prod+1)%max_wr==cons. We currently don't do that because we * double the number of entries due to an iSER issue that pushes far more * WRs than indicated. If we decline its ib_post_send() then we get * error prints in the dmesg we'd like to avoid. */ qp->sq.max_wr = min_t(u32, attrs->cap.max_send_wr * dev->wq_multiplier, qattr->max_wqe); qp->wqe_wr_id = kzalloc(qp->sq.max_wr * sizeof(*qp->wqe_wr_id), GFP_KERNEL); if (!qp->wqe_wr_id) { QL_DPRINT11(ha, "failed SQ shadow memory allocation\n"); return -ENOMEM; } /* QP handle to be written in CQE */ in_params.qp_handle_lo = lower_32_bits((uintptr_t)qp); in_params.qp_handle_hi = upper_32_bits((uintptr_t)qp); /* A single work request may take up to MAX_RQ_WQE_SIZE elements in * the ring. The ring should allow at least a single WR, even if the * user requested none, due to allocation issues. 
*/ qp->rq.max_wr = (u16)max_t(u32, attrs->cap.max_recv_wr, 1); /* Allocate driver internal RQ array */ if (!qp->srq) { qp->rqe_wr_id = kzalloc(qp->rq.max_wr * sizeof(*qp->rqe_wr_id), GFP_KERNEL); if (!qp->rqe_wr_id) { QL_DPRINT11(ha, "failed RQ shadow memory allocation\n"); kfree(qp->wqe_wr_id); return -ENOMEM; } } //qlnxr_init_common_qp_in_params(dev, pd, qp, attrs, true, &in_params); in_params.qp_handle_async_lo = lower_32_bits((uintptr_t)qp); in_params.qp_handle_async_hi = upper_32_bits((uintptr_t)qp); in_params.signal_all = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR); in_params.fmr_and_reserved_lkey = true; in_params.pd = pd->pd_id; in_params.dpi = pd->uctx ? pd->uctx->dpi : dev->dpi; in_params.sq_cq_id = get_qlnxr_cq(attrs->send_cq)->icid; in_params.stats_queue = 0; in_params.rq_cq_id = get_qlnxr_cq(attrs->recv_cq)->icid; if (qp->srq) { /* QP is associated with SRQ instead of RQ */ in_params.srq_id = qp->srq->srq_id; in_params.use_srq = true; QL_DPRINT11(ha, "exit srq_id = 0x%x use_srq = 0x%x\n", in_params.srq_id, in_params.use_srq); } else { in_params.srq_id = 0; in_params.use_srq = false; } n_sq_entries = attrs->cap.max_send_wr; n_sq_entries = min_t(u32, n_sq_entries, qattr->max_wqe); n_sq_entries = max_t(u32, n_sq_entries, 1); n_sq_elems = n_sq_entries * QLNXR_MAX_SQE_ELEMENTS_PER_SQE; n_rq_elems = qp->rq.max_wr * QLNXR_MAX_RQE_ELEMENTS_PER_RQE; if (QLNX_IS_ROCE(dev)) { rc = qlnxr_roce_create_kernel_qp(dev, qp, &in_params, n_sq_elems, n_rq_elems); } else { rc = qlnxr_iwarp_create_kernel_qp(dev, qp, &in_params, n_sq_elems, n_rq_elems); } if (rc) qlnxr_cleanup_kernel(dev, qp); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } struct ib_qp * qlnxr_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attrs, struct ib_udata *udata) { struct qlnxr_dev *dev = get_qlnxr_dev(ibpd->device); struct qlnxr_pd *pd = get_qlnxr_pd(ibpd); struct qlnxr_qp *qp; int rc = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); rc = qlnxr_check_qp_attrs(ibpd, dev, attrs, 
udata); if (rc) { QL_DPRINT11(ha, "qlnxr_check_qp_attrs failed [%d]\n", rc); return ERR_PTR(rc); } QL_DPRINT12(ha, "called from %s, event_handle=%p," " eepd=%p sq_cq=%p, sq_icid=%d, rq_cq=%p, rq_icid=%d\n", (udata ? "user library" : "kernel"), attrs->event_handler, pd, get_qlnxr_cq(attrs->send_cq), get_qlnxr_cq(attrs->send_cq)->icid, get_qlnxr_cq(attrs->recv_cq), get_qlnxr_cq(attrs->recv_cq)->icid); qp = qlnx_zalloc(sizeof(struct qlnxr_qp)); if (!qp) { QL_DPRINT11(ha, "kzalloc(qp) failed\n"); return ERR_PTR(-ENOMEM); } qlnxr_set_common_qp_params(dev, qp, pd, attrs); if (attrs->qp_type == IB_QPT_GSI) { QL_DPRINT11(ha, "calling qlnxr_create_gsi_qp\n"); return qlnxr_create_gsi_qp(dev, attrs, qp); } if (udata) { rc = qlnxr_create_user_qp(dev, qp, ibpd, udata, attrs); if (rc) { QL_DPRINT11(ha, "qlnxr_create_user_qp failed\n"); goto err; } } else { rc = qlnxr_create_kernel_qp(dev, qp, ibpd, attrs); if (rc) { QL_DPRINT11(ha, "qlnxr_create_kernel_qp failed\n"); goto err; } } qp->ibqp.qp_num = qp->qp_id; rc = qlnxr_idr_add(dev, qp, qp->qp_id); if (rc) { QL_DPRINT11(ha, "qlnxr_idr_add failed\n"); goto err; } QL_DPRINT12(ha, "exit [%p]\n", &qp->ibqp); return &qp->ibqp; err: kfree(qp); QL_DPRINT12(ha, "failed exit\n"); return ERR_PTR(-EFAULT); } static enum ib_qp_state qlnxr_get_ibqp_state(enum ecore_roce_qp_state qp_state) { enum ib_qp_state state = IB_QPS_ERR; switch (qp_state) { case ECORE_ROCE_QP_STATE_RESET: state = IB_QPS_RESET; break; case ECORE_ROCE_QP_STATE_INIT: state = IB_QPS_INIT; break; case ECORE_ROCE_QP_STATE_RTR: state = IB_QPS_RTR; break; case ECORE_ROCE_QP_STATE_RTS: state = IB_QPS_RTS; break; case ECORE_ROCE_QP_STATE_SQD: state = IB_QPS_SQD; break; case ECORE_ROCE_QP_STATE_ERR: state = IB_QPS_ERR; break; case ECORE_ROCE_QP_STATE_SQE: state = IB_QPS_SQE; break; } return state; } static enum ecore_roce_qp_state qlnxr_get_state_from_ibqp( enum ib_qp_state qp_state) { enum ecore_roce_qp_state ecore_qp_state; ecore_qp_state = ECORE_ROCE_QP_STATE_ERR; switch 
(qp_state) { case IB_QPS_RESET: ecore_qp_state = ECORE_ROCE_QP_STATE_RESET; break; case IB_QPS_INIT: ecore_qp_state = ECORE_ROCE_QP_STATE_INIT; break; case IB_QPS_RTR: ecore_qp_state = ECORE_ROCE_QP_STATE_RTR; break; case IB_QPS_RTS: ecore_qp_state = ECORE_ROCE_QP_STATE_RTS; break; case IB_QPS_SQD: ecore_qp_state = ECORE_ROCE_QP_STATE_SQD; break; case IB_QPS_ERR: ecore_qp_state = ECORE_ROCE_QP_STATE_ERR; break; default: ecore_qp_state = ECORE_ROCE_QP_STATE_ERR; break; } return (ecore_qp_state); } static void qlnxr_reset_qp_hwq_info(struct qlnxr_qp_hwq_info *qph) { ecore_chain_reset(&qph->pbl); qph->prod = qph->cons = 0; qph->wqe_cons = 0; qph->db_data.data.value = cpu_to_le16(0); return; } static int qlnxr_update_qp_state(struct qlnxr_dev *dev, struct qlnxr_qp *qp, enum ecore_roce_qp_state new_state) { int status = 0; uint32_t reg_addr; struct ecore_dev *cdev; qlnx_host_t *ha; ha = dev->ha; cdev = &ha->cdev; QL_DPRINT12(ha, "enter qp = %p new_state = 0x%x qp->state = 0x%x\n", qp, new_state, qp->state); if (new_state == qp->state) { return 0; } switch (qp->state) { case ECORE_ROCE_QP_STATE_RESET: switch (new_state) { case ECORE_ROCE_QP_STATE_INIT: qp->prev_wqe_size = 0; qlnxr_reset_qp_hwq_info(&qp->sq); if (!(qp->srq)) qlnxr_reset_qp_hwq_info(&qp->rq); break; default: status = -EINVAL; break; }; break; case ECORE_ROCE_QP_STATE_INIT: /* INIT->XXX */ switch (new_state) { case ECORE_ROCE_QP_STATE_RTR: /* Update doorbell (in case post_recv was done before move to RTR) */ if (qp->srq) break; wmb(); //writel(qp->rq.db_data.raw, qp->rq.db); //if (QLNX_IS_IWARP(dev)) // writel(qp->rq.iwarp_db2_data.raw, // qp->rq.iwarp_db2); reg_addr = (uint32_t)((uint8_t *)qp->rq.db - (uint8_t *)cdev->doorbells); bus_write_4(ha->pci_dbells, reg_addr, qp->rq.db_data.raw); bus_barrier(ha->pci_dbells, 0, 0, BUS_SPACE_BARRIER_READ); if (QLNX_IS_IWARP(dev)) { reg_addr = (uint32_t)((uint8_t *)qp->rq.iwarp_db2 - (uint8_t *)cdev->doorbells); bus_write_4(ha->pci_dbells, reg_addr,\ 
qp->rq.iwarp_db2_data.raw); bus_barrier(ha->pci_dbells, 0, 0,\ BUS_SPACE_BARRIER_READ); } mmiowb(); break; case ECORE_ROCE_QP_STATE_ERR: /* TBD:flush qps... */ break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case ECORE_ROCE_QP_STATE_RTR: /* RTR->XXX */ switch (new_state) { case ECORE_ROCE_QP_STATE_RTS: break; case ECORE_ROCE_QP_STATE_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case ECORE_ROCE_QP_STATE_RTS: /* RTS->XXX */ switch (new_state) { case ECORE_ROCE_QP_STATE_SQD: break; case ECORE_ROCE_QP_STATE_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case ECORE_ROCE_QP_STATE_SQD: /* SQD->XXX */ switch (new_state) { case ECORE_ROCE_QP_STATE_RTS: case ECORE_ROCE_QP_STATE_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case ECORE_ROCE_QP_STATE_ERR: /* ERR->XXX */ switch (new_state) { case ECORE_ROCE_QP_STATE_RESET: if ((qp->rq.prod != qp->rq.cons) || (qp->sq.prod != qp->sq.cons)) { QL_DPRINT11(ha, "Error->Reset with rq/sq " "not empty rq.prod=0x%x rq.cons=0x%x" " sq.prod=0x%x sq.cons=0x%x\n", qp->rq.prod, qp->rq.cons, qp->sq.prod, qp->sq.cons); status = -EINVAL; } break; default: status = -EINVAL; break; }; break; default: status = -EINVAL; break; }; QL_DPRINT12(ha, "exit\n"); return status; } int qlnxr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { int rc = 0; struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); struct qlnxr_dev *dev = get_qlnxr_dev(&qp->dev->ibdev); struct ecore_rdma_modify_qp_in_params qp_params = { 0 }; enum ib_qp_state old_qp_state, new_qp_state; struct ecore_rdma_device *qattr = ecore_rdma_query_device(dev->rdma_ctx); qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter qp = %p attr_mask = 0x%x, state = %d udata = %p\n", qp, attr_mask, attr->qp_state, udata); old_qp_state = qlnxr_get_ibqp_state(qp->state); if (attr_mask & IB_QP_STATE) new_qp_state = attr->qp_state; 
else new_qp_state = old_qp_state; if (QLNX_IS_ROCE(dev)) { if (!ib_modify_qp_is_ok(old_qp_state, new_qp_state, ibqp->qp_type, attr_mask )) { QL_DPRINT12(ha, "invalid attribute mask=0x%x" " specified for qpn=0x%x of type=0x%x \n" " old_qp_state=0x%x, new_qp_state=0x%x\n", attr_mask, qp->qp_id, ibqp->qp_type, old_qp_state, new_qp_state); rc = -EINVAL; goto err; } } /* translate the masks... */ if (attr_mask & IB_QP_STATE) { SET_FIELD(qp_params.modify_flags, ECORE_RDMA_MODIFY_QP_VALID_NEW_STATE, 1); qp_params.new_state = qlnxr_get_state_from_ibqp(attr->qp_state); } // TBD consider changing ecore to be a flag as well... if (attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) qp_params.sqd_async = true; if (attr_mask & IB_QP_PKEY_INDEX) { SET_FIELD(qp_params.modify_flags, ECORE_ROCE_MODIFY_QP_VALID_PKEY, 1); if (attr->pkey_index >= QLNXR_ROCE_PKEY_TABLE_LEN) { rc = -EINVAL; goto err; } qp_params.pkey = QLNXR_ROCE_PKEY_DEFAULT; } if (attr_mask & IB_QP_QKEY) { qp->qkey = attr->qkey; } /* tbd consider splitting in ecore.. 
*/ if (attr_mask & IB_QP_ACCESS_FLAGS) { SET_FIELD(qp_params.modify_flags, ECORE_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN, 1); qp_params.incoming_rdma_read_en = attr->qp_access_flags & IB_ACCESS_REMOTE_READ; qp_params.incoming_rdma_write_en = attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE; qp_params.incoming_atomic_en = attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC; } if (attr_mask & (IB_QP_AV | IB_QP_PATH_MTU)) { if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { QL_DPRINT12(ha, "Only MTU sizes of 256, 512, 1024," " 2048 and 4096 are supported " " attr->path_mtu = [%d]\n", attr->path_mtu); rc = -EINVAL; goto err; } qp->mtu = min(ib_mtu_enum_to_int(attr->path_mtu), ib_mtu_enum_to_int( iboe_get_mtu(dev->ha->ifp->if_mtu))); } if (qp->mtu == 0) { qp->mtu = ib_mtu_enum_to_int( iboe_get_mtu(dev->ha->ifp->if_mtu)); QL_DPRINT12(ha, "fixing zetoed MTU to qp->mtu = %d\n", qp->mtu); } SET_FIELD(qp_params.modify_flags, ECORE_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR, 1); qp_params.traffic_class_tos = attr->ah_attr.grh.traffic_class; qp_params.flow_label = attr->ah_attr.grh.flow_label; qp_params.hop_limit_ttl = attr->ah_attr.grh.hop_limit; qp->sgid_idx = attr->ah_attr.grh.sgid_index; get_gid_info(ibqp, attr, attr_mask, dev, qp, &qp_params); rc = qlnxr_get_dmac(dev, &attr->ah_attr, qp_params.remote_mac_addr); if (rc) return rc; qp_params.use_local_mac = true; memcpy(qp_params.local_mac_addr, dev->ha->primary_mac, ETH_ALEN); QL_DPRINT12(ha, "dgid=0x%x:0x%x:0x%x:0x%x\n", qp_params.dgid.dwords[0], qp_params.dgid.dwords[1], qp_params.dgid.dwords[2], qp_params.dgid.dwords[3]); QL_DPRINT12(ha, "sgid=0x%x:0x%x:0x%x:0x%x\n", qp_params.sgid.dwords[0], qp_params.sgid.dwords[1], qp_params.sgid.dwords[2], qp_params.sgid.dwords[3]); QL_DPRINT12(ha, "remote_mac=[0x%x:0x%x:0x%x:0x%x:0x%x:0x%x]\n", qp_params.remote_mac_addr[0], qp_params.remote_mac_addr[1], qp_params.remote_mac_addr[2], qp_params.remote_mac_addr[3], qp_params.remote_mac_addr[4], 
qp_params.remote_mac_addr[5]); qp_params.mtu = qp->mtu; } if (qp_params.mtu == 0) { /* stay with current MTU */ if (qp->mtu) { qp_params.mtu = qp->mtu; } else { qp_params.mtu = ib_mtu_enum_to_int( iboe_get_mtu(dev->ha->ifp->if_mtu)); } } if (attr_mask & IB_QP_TIMEOUT) { SET_FIELD(qp_params.modify_flags, \ ECORE_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT, 1); qp_params.ack_timeout = attr->timeout; if (attr->timeout) { u32 temp; /* 12.7.34 LOCAL ACK TIMEOUT * Value representing the transport (ACK) timeout for * use by the remote, expressed as * 4.096 usec * 2^(Local ACK Timeout) */ /* We use 1UL since the intermediate value may overflow * 32 bits; 4096 is the 4.096 usec base in nanoseconds and * the two divisions by 1000 convert the result to msec */ temp = 4096 * (1UL << attr->timeout) / 1000 / 1000; qp_params.ack_timeout = temp; /* FW requires [msec] */ } else qp_params.ack_timeout = 0; /* infinite */ } if (attr_mask & IB_QP_RETRY_CNT) { SET_FIELD(qp_params.modify_flags,\ ECORE_ROCE_MODIFY_QP_VALID_RETRY_CNT, 1); qp_params.retry_cnt = attr->retry_cnt; } if (attr_mask & IB_QP_RNR_RETRY) { SET_FIELD(qp_params.modify_flags, ECORE_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT, 1); qp_params.rnr_retry_cnt = attr->rnr_retry; } if (attr_mask & IB_QP_RQ_PSN) { SET_FIELD(qp_params.modify_flags, ECORE_ROCE_MODIFY_QP_VALID_RQ_PSN, 1); qp_params.rq_psn = attr->rq_psn; qp->rq_psn = attr->rq_psn; } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic > qattr->max_qp_req_rd_atomic_resc) { rc = -EINVAL; QL_DPRINT12(ha, "unsupported max_rd_atomic=%d, supported=%d\n", attr->max_rd_atomic, qattr->max_qp_req_rd_atomic_resc); goto err; } SET_FIELD(qp_params.modify_flags, ECORE_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ, 1); qp_params.max_rd_atomic_req = attr->max_rd_atomic; } if (attr_mask & IB_QP_MIN_RNR_TIMER) { SET_FIELD(qp_params.modify_flags, ECORE_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER, 1); qp_params.min_rnr_nak_timer = attr->min_rnr_timer; } if (attr_mask & IB_QP_SQ_PSN) { SET_FIELD(qp_params.modify_flags, ECORE_ROCE_MODIFY_QP_VALID_SQ_PSN, 1); qp_params.sq_psn = attr->sq_psn; qp->sq_psn = 
attr->sq_psn; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic > qattr->max_qp_resp_rd_atomic_resc) { QL_DPRINT12(ha, "unsupported max_dest_rd_atomic=%d, " "supported=%d\n", attr->max_dest_rd_atomic, qattr->max_qp_resp_rd_atomic_resc); rc = -EINVAL; goto err; } SET_FIELD(qp_params.modify_flags, ECORE_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP, 1); qp_params.max_rd_atomic_resp = attr->max_dest_rd_atomic; } if (attr_mask & IB_QP_DEST_QPN) { SET_FIELD(qp_params.modify_flags, ECORE_ROCE_MODIFY_QP_VALID_DEST_QP, 1); qp_params.dest_qp = attr->dest_qp_num; qp->dest_qp_num = attr->dest_qp_num; } /* * Update the QP state before the actual ramrod to prevent a race with * fast path. Modifying the QP state to error will cause the device to * flush the CQEs and while polling the flushed CQEs will considered as * a potential issue if the QP isn't in error state. */ if ((attr_mask & IB_QP_STATE) && (qp->qp_type != IB_QPT_GSI) && (!udata) && (qp_params.new_state == ECORE_ROCE_QP_STATE_ERR)) qp->state = ECORE_ROCE_QP_STATE_ERR; if (qp->qp_type != IB_QPT_GSI) rc = ecore_rdma_modify_qp(dev->rdma_ctx, qp->ecore_qp, &qp_params); if (attr_mask & IB_QP_STATE) { if ((qp->qp_type != IB_QPT_GSI) && (!udata)) rc = qlnxr_update_qp_state(dev, qp, qp_params.new_state); qp->state = qp_params.new_state; } err: QL_DPRINT12(ha, "exit\n"); return rc; } static int qlnxr_to_ib_qp_acc_flags(struct ecore_rdma_query_qp_out_params *params) { int ib_qp_acc_flags = 0; if (params->incoming_rdma_write_en) ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE; if (params->incoming_rdma_read_en) ib_qp_acc_flags |= IB_ACCESS_REMOTE_READ; if (params->incoming_atomic_en) ib_qp_acc_flags |= IB_ACCESS_REMOTE_ATOMIC; if (true) /* FIXME -> local write ?? 
*/ ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE; return ib_qp_acc_flags; } static enum ib_mtu qlnxr_mtu_int_to_enum(u16 mtu) { enum ib_mtu ib_mtu_size; switch (mtu) { case 256: ib_mtu_size = IB_MTU_256; break; case 512: ib_mtu_size = IB_MTU_512; break; case 1024: ib_mtu_size = IB_MTU_1024; break; case 2048: ib_mtu_size = IB_MTU_2048; break; case 4096: ib_mtu_size = IB_MTU_4096; break; default: ib_mtu_size = IB_MTU_1024; break; } return (ib_mtu_size); } int qlnxr_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int attr_mask, struct ib_qp_init_attr *qp_init_attr) { int rc = 0; struct ecore_rdma_query_qp_out_params params; struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); struct qlnxr_dev *dev = qp->dev; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memset(¶ms, 0, sizeof(params)); rc = ecore_rdma_query_qp(dev->rdma_ctx, qp->ecore_qp, ¶ms); if (rc) goto err; memset(qp_attr, 0, sizeof(*qp_attr)); memset(qp_init_attr, 0, sizeof(*qp_init_attr)); qp_attr->qp_state = qlnxr_get_ibqp_state(params.state); qp_attr->cur_qp_state = qlnxr_get_ibqp_state(params.state); /* In some cases in iWARP qelr will ask for the state only */ if (QLNX_IS_IWARP(dev) && (attr_mask == IB_QP_STATE)) { QL_DPRINT11(ha, "only state requested\n"); return 0; } qp_attr->path_mtu = qlnxr_mtu_int_to_enum(params.mtu); qp_attr->path_mig_state = IB_MIG_MIGRATED; qp_attr->rq_psn = params.rq_psn; qp_attr->sq_psn = params.sq_psn; qp_attr->dest_qp_num = params.dest_qp; qp_attr->qp_access_flags = qlnxr_to_ib_qp_acc_flags(¶ms); QL_DPRINT12(ha, "qp_state = 0x%x cur_qp_state = 0x%x " "path_mtu = %d qp_access_flags = 0x%x\n", qp_attr->qp_state, qp_attr->cur_qp_state, qp_attr->path_mtu, qp_attr->qp_access_flags); qp_attr->cap.max_send_wr = qp->sq.max_wr; qp_attr->cap.max_recv_wr = qp->rq.max_wr; qp_attr->cap.max_send_sge = qp->sq.max_sges; qp_attr->cap.max_recv_sge = qp->rq.max_sges; qp_attr->cap.max_inline_data = qp->max_inline_data; qp_init_attr->cap = qp_attr->cap; memcpy(&qp_attr->ah_attr.grh.dgid.raw[0], 
¶ms.dgid.bytes[0], sizeof(qp_attr->ah_attr.grh.dgid.raw)); qp_attr->ah_attr.grh.flow_label = params.flow_label; qp_attr->ah_attr.grh.sgid_index = qp->sgid_idx; qp_attr->ah_attr.grh.hop_limit = params.hop_limit_ttl; qp_attr->ah_attr.grh.traffic_class = params.traffic_class_tos; qp_attr->ah_attr.ah_flags = IB_AH_GRH; qp_attr->ah_attr.port_num = 1; /* FIXME -> check this */ qp_attr->ah_attr.sl = 0;/* FIXME -> check this */ qp_attr->timeout = params.timeout; qp_attr->rnr_retry = params.rnr_retry; qp_attr->retry_cnt = params.retry_cnt; qp_attr->min_rnr_timer = params.min_rnr_nak_timer; qp_attr->pkey_index = params.pkey_index; qp_attr->port_num = 1; /* FIXME -> check this */ qp_attr->ah_attr.src_path_bits = 0; qp_attr->ah_attr.static_rate = 0; qp_attr->alt_pkey_index = 0; qp_attr->alt_port_num = 0; qp_attr->alt_timeout = 0; memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr)); qp_attr->sq_draining = (params.state == ECORE_ROCE_QP_STATE_SQD) ? 1 : 0; qp_attr->max_dest_rd_atomic = params.max_dest_rd_atomic; qp_attr->max_rd_atomic = params.max_rd_atomic; qp_attr->en_sqd_async_notify = (params.sqd_async)? 
1 : 0; QL_DPRINT12(ha, "max_inline_data=%d\n", qp_attr->cap.max_inline_data); err: QL_DPRINT12(ha, "exit\n"); return rc; } static void qlnxr_cleanup_user(struct qlnxr_dev *dev, struct qlnxr_qp *qp) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (qp->usq.umem) ib_umem_release(qp->usq.umem); qp->usq.umem = NULL; if (qp->urq.umem) ib_umem_release(qp->urq.umem); qp->urq.umem = NULL; QL_DPRINT12(ha, "exit\n"); return; } static void qlnxr_cleanup_kernel(struct qlnxr_dev *dev, struct qlnxr_qp *qp) { qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (qlnxr_qp_has_sq(qp)) { QL_DPRINT12(ha, "freeing SQ\n"); ha->qlnxr_debug = 1; // ecore_chain_free(dev->cdev, &qp->sq.pbl); ha->qlnxr_debug = 0; kfree(qp->wqe_wr_id); } if (qlnxr_qp_has_rq(qp)) { QL_DPRINT12(ha, "freeing RQ\n"); ha->qlnxr_debug = 1; // ecore_chain_free(dev->cdev, &qp->rq.pbl); ha->qlnxr_debug = 0; kfree(qp->rqe_wr_id); } QL_DPRINT12(ha, "exit\n"); return; } int qlnxr_free_qp_resources(struct qlnxr_dev *dev, struct qlnxr_qp *qp) { int rc = 0; qlnx_host_t *ha; struct ecore_rdma_destroy_qp_out_params d_out_params; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); #if 0 if (qp->qp_type != IB_QPT_GSI) { rc = ecore_rdma_destroy_qp(dev->rdma_ctx, qp->ecore_qp, &d_out_params); if (rc) return rc; } if (qp->ibqp.uobject && qp->ibqp.uobject->context) qlnxr_cleanup_user(dev, qp); else qlnxr_cleanup_kernel(dev, qp); #endif if (qp->ibqp.uobject && qp->ibqp.uobject->context) qlnxr_cleanup_user(dev, qp); else qlnxr_cleanup_kernel(dev, qp); if (qp->qp_type != IB_QPT_GSI) { rc = ecore_rdma_destroy_qp(dev->rdma_ctx, qp->ecore_qp, &d_out_params); if (rc) return rc; } QL_DPRINT12(ha, "exit\n"); return 0; } int qlnxr_destroy_qp(struct ib_qp *ibqp) { struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); struct qlnxr_dev *dev = qp->dev; int rc = 0; struct ib_qp_attr attr; int attr_mask = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter qp = %p, qp_type=%d\n", qp, qp->qp_type); qp->destroyed = 1; if (QLNX_IS_ROCE(dev) 
&& (qp->state != (ECORE_ROCE_QP_STATE_RESET | ECORE_ROCE_QP_STATE_ERR | ECORE_ROCE_QP_STATE_INIT))) { attr.qp_state = IB_QPS_ERR; attr_mask |= IB_QP_STATE; /* change the QP state to ERROR */ qlnxr_modify_qp(ibqp, &attr, attr_mask, NULL); } if (qp->qp_type == IB_QPT_GSI) qlnxr_destroy_gsi_qp(dev); qp->sig = ~qp->sig; qlnxr_free_qp_resources(dev, qp); if (atomic_dec_and_test(&qp->refcnt)) { /* TODO: only for iWARP? */ qlnxr_idr_remove(dev, qp->qp_id); kfree(qp); } QL_DPRINT12(ha, "exit\n"); return rc; } static inline int qlnxr_wq_is_full(struct qlnxr_qp_hwq_info *wq) { return (((wq->prod + 1) % wq->max_wr) == wq->cons); } static int sge_data_len(struct ib_sge *sg_list, int num_sge) { int i, len = 0; for (i = 0; i < num_sge; i++) len += sg_list[i].length; return len; } static void swap_wqe_data64(u64 *p) { int i; for (i = 0; i < QLNXR_SQE_ELEMENT_SIZE / sizeof(u64); i++, p++) *p = cpu_to_be64(cpu_to_le64(*p)); } static u32 qlnxr_prepare_sq_inline_data(struct qlnxr_dev *dev, struct qlnxr_qp *qp, u8 *wqe_size, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, u8 *bits, u8 bit) { int i, seg_siz; char *seg_prt, *wqe; u32 data_size = sge_data_len(wr->sg_list, wr->num_sge); qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter[%d]\n", data_size); if (data_size > ROCE_REQ_MAX_INLINE_DATA_SIZE) { QL_DPRINT12(ha, "Too much inline data in WR:[%d, %d]\n", data_size, ROCE_REQ_MAX_INLINE_DATA_SIZE); *bad_wr = wr; return 0; } if (!data_size) return data_size; /* set the bit */ *bits |= bit; seg_prt = wqe = NULL; seg_siz = 0; /* copy data inline */ for (i = 0; i < wr->num_sge; i++) { u32 len = wr->sg_list[i].length; void *src = (void *)(uintptr_t)wr->sg_list[i].addr; while (len > 0) { u32 cur; /* new segment required */ if (!seg_siz) { wqe = (char *)ecore_chain_produce(&qp->sq.pbl); seg_prt = wqe; seg_siz = sizeof(struct rdma_sq_common_wqe); (*wqe_size)++; } /* calculate currently allowed length */ cur = 
MIN(len, seg_siz); memcpy(seg_prt, src, cur); /* update segment variables */ seg_prt += cur; seg_siz -= cur; /* update sge variables */ src += cur; len -= cur; /* swap fully-completed segments */ if (!seg_siz) swap_wqe_data64((u64 *)wqe); } } /* swap last not completed segment */ if (seg_siz) swap_wqe_data64((u64 *)wqe); QL_DPRINT12(ha, "exit\n"); return data_size; } static u32 qlnxr_prepare_sq_sges(struct qlnxr_dev *dev, struct qlnxr_qp *qp, - u8 *wqe_size, struct ib_send_wr *wr) + u8 *wqe_size, const struct ib_send_wr *wr) { int i; u32 data_size = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter wr->num_sge = %d \n", wr->num_sge); for (i = 0; i < wr->num_sge; i++) { struct rdma_sq_sge *sge = ecore_chain_produce(&qp->sq.pbl); TYPEPTR_ADDR_SET(sge, addr, wr->sg_list[i].addr); sge->l_key = cpu_to_le32(wr->sg_list[i].lkey); sge->length = cpu_to_le32(wr->sg_list[i].length); data_size += wr->sg_list[i].length; } if (wqe_size) *wqe_size += wr->num_sge; QL_DPRINT12(ha, "exit data_size = %d\n", data_size); return data_size; } static u32 qlnxr_prepare_sq_rdma_data(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct rdma_sq_rdma_wqe_1st *rwqe, struct rdma_sq_rdma_wqe_2nd *rwqe2, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { qlnx_host_t *ha; u32 ret = 0; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); rwqe2->r_key = cpu_to_le32(rdma_wr(wr)->rkey); TYPEPTR_ADDR_SET(rwqe2, remote_va, rdma_wr(wr)->remote_addr); if (wr->send_flags & IB_SEND_INLINE) { u8 flags = 0; SET_FIELD2(flags, RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG, 1); return qlnxr_prepare_sq_inline_data(dev, qp, &rwqe->wqe_size, wr, bad_wr, &rwqe->flags, flags); } ret = qlnxr_prepare_sq_sges(dev, qp, &rwqe->wqe_size, wr); QL_DPRINT12(ha, "exit ret = 0x%x\n", ret); return (ret); } static u32 qlnxr_prepare_sq_send_data(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct rdma_sq_send_wqe *swqe, struct rdma_sq_send_wqe *swqe2, - struct ib_send_wr *wr, - 
struct ib_send_wr **bad_wr) + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { qlnx_host_t *ha; u32 ret = 0; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memset(swqe2, 0, sizeof(*swqe2)); if (wr->send_flags & IB_SEND_INLINE) { u8 flags = 0; SET_FIELD2(flags, RDMA_SQ_SEND_WQE_INLINE_FLG, 1); return qlnxr_prepare_sq_inline_data(dev, qp, &swqe->wqe_size, wr, bad_wr, &swqe->flags, flags); } ret = qlnxr_prepare_sq_sges(dev, qp, &swqe->wqe_size, wr); QL_DPRINT12(ha, "exit ret = 0x%x\n", ret); return (ret); } static void qlnx_handle_completed_mrs(struct qlnxr_dev *dev, struct mr_info *info) { qlnx_host_t *ha; ha = dev->ha; int work = info->completed - info->completed_handled - 1; QL_DPRINT12(ha, "enter [%d]\n", work); while (work-- > 0 && !list_empty(&info->inuse_pbl_list)) { struct qlnxr_pbl *pbl; /* Free all the page list that are possible to be freed * (all the ones that were invalidated), under the assumption * that if an FMR was completed successfully that means that * if there was an invalidate operation before it also ended */ pbl = list_first_entry(&info->inuse_pbl_list, struct qlnxr_pbl, list_entry); list_del(&pbl->list_entry); list_add_tail(&pbl->list_entry, &info->free_pbl_list); info->completed_handled++; } QL_DPRINT12(ha, "exit\n"); return; } #if __FreeBSD_version >= 1102000 static int qlnxr_prepare_reg(struct qlnxr_qp *qp, struct rdma_sq_fmr_wqe_1st *fwqe1, - struct ib_reg_wr *wr) + const struct ib_reg_wr *wr) { struct qlnxr_mr *mr = get_qlnxr_mr(wr->mr); struct rdma_sq_fmr_wqe_2nd *fwqe2; fwqe2 = (struct rdma_sq_fmr_wqe_2nd *)ecore_chain_produce(&qp->sq.pbl); fwqe1->addr.hi = upper_32_bits(mr->ibmr.iova); fwqe1->addr.lo = lower_32_bits(mr->ibmr.iova); fwqe1->l_key = wr->key; fwqe2->access_ctrl = 0; SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_REMOTE_READ, !!(wr->access & IB_ACCESS_REMOTE_READ)); SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_REMOTE_WRITE, !!(wr->access & IB_ACCESS_REMOTE_WRITE)); SET_FIELD2(fwqe2->access_ctrl, 
RDMA_SQ_FMR_WQE_2ND_ENABLE_ATOMIC, !!(wr->access & IB_ACCESS_REMOTE_ATOMIC)); SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_LOCAL_READ, 1); SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_LOCAL_WRITE, !!(wr->access & IB_ACCESS_LOCAL_WRITE)); fwqe2->fmr_ctrl = 0; SET_FIELD2(fwqe2->fmr_ctrl, RDMA_SQ_FMR_WQE_2ND_PAGE_SIZE_LOG, ilog2(mr->ibmr.page_size) - 12); fwqe2->length_hi = 0; /* TODO - figure out why length is only 32bit.. */ fwqe2->length_lo = mr->ibmr.length; fwqe2->pbl_addr.hi = upper_32_bits(mr->info.pbl_table->pa); fwqe2->pbl_addr.lo = lower_32_bits(mr->info.pbl_table->pa); qp->wqe_wr_id[qp->sq.prod].mr = mr; return 0; } #else static void -build_frmr_pbes(struct qlnxr_dev *dev, struct ib_send_wr *wr, +build_frmr_pbes(struct qlnxr_dev *dev, const struct ib_send_wr *wr, struct mr_info *info) { int i; u64 buf_addr = 0; int num_pbes, total_num_pbes = 0; struct regpair *pbe; struct qlnxr_pbl *pbl_tbl = info->pbl_table; struct qlnxr_pbl_info *pbl_info = &info->pbl_info; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); pbe = (struct regpair *)pbl_tbl->va; num_pbes = 0; for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { buf_addr = wr->wr.fast_reg.page_list->page_list[i]; pbe->lo = cpu_to_le32((u32)buf_addr); pbe->hi = cpu_to_le32((u32)upper_32_bits(buf_addr)); num_pbes += 1; pbe++; total_num_pbes++; if (total_num_pbes == pbl_info->num_pbes) return; /* if the given pbl is full storing the pbes, * move to next pbl. 
*/ if (num_pbes == (pbl_info->pbl_size / sizeof(u64))) { pbl_tbl++; pbe = (struct regpair *)pbl_tbl->va; num_pbes = 0; } } QL_DPRINT12(ha, "exit\n"); return; } static int qlnxr_prepare_safe_pbl(struct qlnxr_dev *dev, struct mr_info *info) { int rc = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (info->completed == 0) { //DP_VERBOSE(dev, QLNXR_MSG_MR, "First FMR\n"); /* first fmr */ return 0; } qlnx_handle_completed_mrs(dev, info); list_add_tail(&info->pbl_table->list_entry, &info->inuse_pbl_list); if (list_empty(&info->free_pbl_list)) { info->pbl_table = qlnxr_alloc_pbl_tbl(dev, &info->pbl_info, GFP_ATOMIC); } else { info->pbl_table = list_first_entry(&info->free_pbl_list, struct qlnxr_pbl, list_entry); list_del(&info->pbl_table->list_entry); } if (!info->pbl_table) rc = -ENOMEM; QL_DPRINT12(ha, "exit\n"); return rc; } static inline int qlnxr_prepare_fmr(struct qlnxr_qp *qp, struct rdma_sq_fmr_wqe_1st *fwqe1, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { struct qlnxr_dev *dev = qp->dev; u64 fbo; struct qlnxr_fast_reg_page_list *frmr_list = get_qlnxr_frmr_list(wr->wr.fast_reg.page_list); struct rdma_sq_fmr_wqe *fwqe2 = (struct rdma_sq_fmr_wqe *)ecore_chain_produce(&qp->sq.pbl); int rc = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (wr->wr.fast_reg.page_list_len == 0) BUG(); rc = qlnxr_prepare_safe_pbl(dev, &frmr_list->info); if (rc) return rc; fwqe1->addr.hi = upper_32_bits(wr->wr.fast_reg.iova_start); fwqe1->addr.lo = lower_32_bits(wr->wr.fast_reg.iova_start); fwqe1->l_key = wr->wr.fast_reg.rkey; SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_REMOTE_READ, !!(wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_READ)); SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_REMOTE_WRITE, !!(wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_WRITE)); SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_ENABLE_ATOMIC, !!(wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_ATOMIC)); SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_LOCAL_READ, 1); 
SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_LOCAL_WRITE, !!(wr->wr.fast_reg.access_flags & IB_ACCESS_LOCAL_WRITE)); fwqe2->fmr_ctrl = 0; SET_FIELD2(fwqe2->fmr_ctrl, RDMA_SQ_FMR_WQE_2ND_PAGE_SIZE_LOG, ilog2(1 << wr->wr.fast_reg.page_shift) - 12); SET_FIELD2(fwqe2->fmr_ctrl, RDMA_SQ_FMR_WQE_2ND_ZERO_BASED, 0); fwqe2->length_hi = 0; /* Todo - figure this out... why length is only 32bit.. */ fwqe2->length_lo = wr->wr.fast_reg.length; fwqe2->pbl_addr.hi = upper_32_bits(frmr_list->info.pbl_table->pa); fwqe2->pbl_addr.lo = lower_32_bits(frmr_list->info.pbl_table->pa); /* produce another wqe for fwqe3 */ ecore_chain_produce(&qp->sq.pbl); fbo = wr->wr.fast_reg.iova_start - (wr->wr.fast_reg.page_list->page_list[0] & PAGE_MASK); QL_DPRINT12(ha, "wr.fast_reg.iova_start = %p rkey=%x addr=%x:%x" " length = %x pbl_addr %x:%x\n", wr->wr.fast_reg.iova_start, wr->wr.fast_reg.rkey, fwqe1->addr.hi, fwqe1->addr.lo, fwqe2->length_lo, fwqe2->pbl_addr.hi, fwqe2->pbl_addr.lo); build_frmr_pbes(dev, wr, &frmr_list->info); qp->wqe_wr_id[qp->sq.prod].frmr = frmr_list; QL_DPRINT12(ha, "exit\n"); return 0; } #endif /* #if __FreeBSD_version >= 1102000 */ static enum ib_wc_opcode qlnxr_ib_to_wc_opcode(enum ib_wr_opcode opcode) { switch (opcode) { case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE; case IB_WR_SEND_WITH_IMM: case IB_WR_SEND: case IB_WR_SEND_WITH_INV: return IB_WC_SEND; case IB_WR_RDMA_READ: return IB_WC_RDMA_READ; case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP; case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD; #if __FreeBSD_version >= 1102000 case IB_WR_REG_MR: return IB_WC_REG_MR; #else case IB_WR_FAST_REG_MR: return IB_WC_FAST_REG_MR; #endif /* #if __FreeBSD_version >= 1102000 */ case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; default: return IB_WC_SEND; } } static inline bool -qlnxr_can_post_send(struct qlnxr_qp *qp, struct ib_send_wr *wr) +qlnxr_can_post_send(struct qlnxr_qp *qp, const struct ib_send_wr *wr) { int wq_is_full, err_wr, 
pbl_is_full; struct qlnxr_dev *dev = qp->dev; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter[qp, wr] = [%p,%p]\n", qp, wr); /* prevent SQ overflow and/or processing of a bad WR */ err_wr = wr->num_sge > qp->sq.max_sges; wq_is_full = qlnxr_wq_is_full(&qp->sq); pbl_is_full = ecore_chain_get_elem_left_u32(&qp->sq.pbl) < QLNXR_MAX_SQE_ELEMENTS_PER_SQE; if (wq_is_full || err_wr || pbl_is_full) { if (wq_is_full && !(qp->err_bitmap & QLNXR_QP_ERR_SQ_FULL)) { qp->err_bitmap |= QLNXR_QP_ERR_SQ_FULL; QL_DPRINT12(ha, "error: WQ is full. Post send on QP failed" " (this error appears only once) " "[qp, wr, qp->err_bitmap]=[%p, %p, 0x%x]\n", qp, wr, qp->err_bitmap); } if (err_wr && !(qp->err_bitmap & QLNXR_QP_ERR_BAD_SR)) { qp->err_bitmap |= QLNXR_QP_ERR_BAD_SR; QL_DPRINT12(ha, "error: WQ is bad. Post send on QP failed" " (this error appears only once) " "[qp, wr, qp->err_bitmap]=[%p, %p, 0x%x]\n", qp, wr, qp->err_bitmap); } if (pbl_is_full && !(qp->err_bitmap & QLNXR_QP_ERR_SQ_PBL_FULL)) { qp->err_bitmap |= QLNXR_QP_ERR_SQ_PBL_FULL; QL_DPRINT12(ha, "error: WQ PBL is full. 
Post send on QP failed" " (this error appears only once) " "[qp, wr, qp->err_bitmap]=[%p, %p, 0x%x]\n", qp, wr, qp->err_bitmap); } return false; } QL_DPRINT12(ha, "exit[qp, wr] = [%p,%p]\n", qp, wr); return true; } int qlnxr_post_send(struct ib_qp *ibqp, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct qlnxr_dev *dev = get_qlnxr_dev(ibqp->device); struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); unsigned long flags; int status = 0, rc = 0; bool comp; qlnx_host_t *ha; uint32_t reg_addr; *bad_wr = NULL; ha = dev->ha; QL_DPRINT12(ha, "exit[ibqp, wr, bad_wr] = [%p, %p, %p]\n", ibqp, wr, bad_wr); if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; if (qp->qp_type == IB_QPT_GSI) return qlnxr_gsi_post_send(ibqp, wr, bad_wr); spin_lock_irqsave(&qp->q_lock, flags); if (QLNX_IS_ROCE(dev) && (qp->state != ECORE_ROCE_QP_STATE_RTS) && (qp->state != ECORE_ROCE_QP_STATE_ERR) && (qp->state != ECORE_ROCE_QP_STATE_SQD)) { spin_unlock_irqrestore(&qp->q_lock, flags); *bad_wr = wr; QL_DPRINT11(ha, "QP in wrong state! 
QP icid=0x%x state %d\n", qp->icid, qp->state); return -EINVAL; } if (!wr) { QL_DPRINT11(ha, "Got an empty post send???\n"); } while (wr) { struct rdma_sq_common_wqe *wqe; struct rdma_sq_send_wqe *swqe; struct rdma_sq_send_wqe *swqe2; struct rdma_sq_rdma_wqe_1st *rwqe; struct rdma_sq_rdma_wqe_2nd *rwqe2; struct rdma_sq_local_inv_wqe *iwqe; struct rdma_sq_atomic_wqe *awqe1; struct rdma_sq_atomic_wqe *awqe2; struct rdma_sq_atomic_wqe *awqe3; struct rdma_sq_fmr_wqe_1st *fwqe1; if (!qlnxr_can_post_send(qp, wr)) { status = -ENOMEM; *bad_wr = wr; break; } wqe = ecore_chain_produce(&qp->sq.pbl); qp->wqe_wr_id[qp->sq.prod].signaled = !!(wr->send_flags & IB_SEND_SIGNALED) || qp->signaled; /* common fields */ wqe->flags = 0; wqe->flags |= (RDMA_SQ_SEND_WQE_COMP_FLG_MASK << RDMA_SQ_SEND_WQE_COMP_FLG_SHIFT); SET_FIELD2(wqe->flags, RDMA_SQ_SEND_WQE_SE_FLG, \ !!(wr->send_flags & IB_SEND_SOLICITED)); comp = (!!(wr->send_flags & IB_SEND_SIGNALED)) || (qp->signaled); SET_FIELD2(wqe->flags, RDMA_SQ_SEND_WQE_COMP_FLG, comp); SET_FIELD2(wqe->flags, RDMA_SQ_SEND_WQE_RD_FENCE_FLG, \ !!(wr->send_flags & IB_SEND_FENCE)); wqe->prev_wqe_size = qp->prev_wqe_size; qp->wqe_wr_id[qp->sq.prod].opcode = qlnxr_ib_to_wc_opcode(wr->opcode); switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_IMM; swqe = (struct rdma_sq_send_wqe *)wqe; swqe->wqe_size = 2; swqe2 = (struct rdma_sq_send_wqe *) ecore_chain_produce(&qp->sq.pbl); swqe->inv_key_or_imm_data = cpu_to_le32(wr->ex.imm_data); swqe->length = cpu_to_le32( qlnxr_prepare_sq_send_data(dev, qp, swqe, swqe2, wr, bad_wr)); qp->wqe_wr_id[qp->sq.prod].wqe_size = swqe->wqe_size; qp->prev_wqe_size = swqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = swqe->length; QL_DPRINT12(ha, "SEND w/ IMM length = %d imm data=%x\n", swqe->length, wr->ex.imm_data); break; case IB_WR_SEND: wqe->req_type = RDMA_SQ_REQ_TYPE_SEND; swqe = (struct rdma_sq_send_wqe *)wqe; swqe->wqe_size = 2; swqe2 = (struct rdma_sq_send_wqe *) 
ecore_chain_produce(&qp->sq.pbl); swqe->length = cpu_to_le32( qlnxr_prepare_sq_send_data(dev, qp, swqe, swqe2, wr, bad_wr)); qp->wqe_wr_id[qp->sq.prod].wqe_size = swqe->wqe_size; qp->prev_wqe_size = swqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = swqe->length; QL_DPRINT12(ha, "SEND w/o IMM length = %d\n", swqe->length); break; case IB_WR_SEND_WITH_INV: wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_INVALIDATE; swqe = (struct rdma_sq_send_wqe *)wqe; swqe2 = (struct rdma_sq_send_wqe *) ecore_chain_produce(&qp->sq.pbl); swqe->wqe_size = 2; swqe->inv_key_or_imm_data = cpu_to_le32(wr->ex.invalidate_rkey); swqe->length = cpu_to_le32(qlnxr_prepare_sq_send_data(dev, qp, swqe, swqe2, wr, bad_wr)); qp->wqe_wr_id[qp->sq.prod].wqe_size = swqe->wqe_size; qp->prev_wqe_size = swqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = swqe->length; QL_DPRINT12(ha, "SEND w INVALIDATE length = %d\n", swqe->length); break; case IB_WR_RDMA_WRITE_WITH_IMM: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM; rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; rwqe->wqe_size = 2; rwqe->imm_data = htonl(cpu_to_le32(wr->ex.imm_data)); rwqe2 = (struct rdma_sq_rdma_wqe_2nd *) ecore_chain_produce(&qp->sq.pbl); rwqe->length = cpu_to_le32(qlnxr_prepare_sq_rdma_data(dev, qp, rwqe, rwqe2, wr, bad_wr)); qp->wqe_wr_id[qp->sq.prod].wqe_size = rwqe->wqe_size; qp->prev_wqe_size = rwqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = rwqe->length; QL_DPRINT12(ha, "RDMA WRITE w/ IMM length = %d imm data=%x\n", rwqe->length, rwqe->imm_data); break; case IB_WR_RDMA_WRITE: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR; rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; rwqe->wqe_size = 2; rwqe2 = (struct rdma_sq_rdma_wqe_2nd *) ecore_chain_produce(&qp->sq.pbl); rwqe->length = cpu_to_le32(qlnxr_prepare_sq_rdma_data(dev, qp, rwqe, rwqe2, wr, bad_wr)); qp->wqe_wr_id[qp->sq.prod].wqe_size = rwqe->wqe_size; qp->prev_wqe_size = rwqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = rwqe->length; QL_DPRINT12(ha, "RDMA WRITE w/o IMM 
length = %d\n", rwqe->length); break; case IB_WR_RDMA_READ_WITH_INV: QL_DPRINT12(ha, "RDMA READ WITH INVALIDATE not supported\n"); *bad_wr = wr; rc = -EINVAL; break; case IB_WR_RDMA_READ: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_RD; rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; rwqe->wqe_size = 2; rwqe2 = (struct rdma_sq_rdma_wqe_2nd *) ecore_chain_produce(&qp->sq.pbl); rwqe->length = cpu_to_le32(qlnxr_prepare_sq_rdma_data(dev, qp, rwqe, rwqe2, wr, bad_wr)); qp->wqe_wr_id[qp->sq.prod].wqe_size = rwqe->wqe_size; qp->prev_wqe_size = rwqe->wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = rwqe->length; QL_DPRINT12(ha, "RDMA READ length = %d\n", rwqe->length); break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: QL_DPRINT12(ha, "ATOMIC operation = %s\n", ((wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) ? "IB_WR_ATOMIC_CMP_AND_SWP" : "IB_WR_ATOMIC_FETCH_AND_ADD")); awqe1 = (struct rdma_sq_atomic_wqe *)wqe; awqe1->prev_wqe_size = 4; awqe2 = (struct rdma_sq_atomic_wqe *) ecore_chain_produce(&qp->sq.pbl); TYPEPTR_ADDR_SET(awqe2, remote_va, \ atomic_wr(wr)->remote_addr); awqe2->r_key = cpu_to_le32(atomic_wr(wr)->rkey); awqe3 = (struct rdma_sq_atomic_wqe *) ecore_chain_produce(&qp->sq.pbl); if (wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_ADD; TYPEPTR_ADDR_SET(awqe3, swap_data, atomic_wr(wr)->compare_add); } else { wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP; TYPEPTR_ADDR_SET(awqe3, swap_data, atomic_wr(wr)->swap); TYPEPTR_ADDR_SET(awqe3, cmp_data, atomic_wr(wr)->compare_add); } qlnxr_prepare_sq_sges(dev, qp, NULL, wr); qp->wqe_wr_id[qp->sq.prod].wqe_size = awqe1->prev_wqe_size; qp->prev_wqe_size = awqe1->prev_wqe_size; break; case IB_WR_LOCAL_INV: QL_DPRINT12(ha, "INVALIDATE length (IB_WR_LOCAL_INV)\n"); iwqe = (struct rdma_sq_local_inv_wqe *)wqe; iwqe->prev_wqe_size = 1; iwqe->req_type = RDMA_SQ_REQ_TYPE_LOCAL_INVALIDATE; iwqe->inv_l_key = wr->ex.invalidate_rkey; qp->wqe_wr_id[qp->sq.prod].wqe_size = iwqe->prev_wqe_size; 
qp->prev_wqe_size = iwqe->prev_wqe_size; break; #if __FreeBSD_version >= 1102000 case IB_WR_REG_MR: QL_DPRINT12(ha, "IB_WR_REG_MR\n"); wqe->req_type = RDMA_SQ_REQ_TYPE_FAST_MR; fwqe1 = (struct rdma_sq_fmr_wqe_1st *)wqe; fwqe1->wqe_size = 2; rc = qlnxr_prepare_reg(qp, fwqe1, reg_wr(wr)); if (rc) { QL_DPRINT11(ha, "IB_WR_REG_MR failed rc=%d\n", rc); *bad_wr = wr; break; } qp->wqe_wr_id[qp->sq.prod].wqe_size = fwqe1->wqe_size; qp->prev_wqe_size = fwqe1->wqe_size; break; #else case IB_WR_FAST_REG_MR: QL_DPRINT12(ha, "FAST_MR (IB_WR_FAST_REG_MR)\n"); wqe->req_type = RDMA_SQ_REQ_TYPE_FAST_MR; fwqe1 = (struct rdma_sq_fmr_wqe_1st *)wqe; fwqe1->prev_wqe_size = 3; rc = qlnxr_prepare_fmr(qp, fwqe1, wr); if (rc) { QL_DPRINT12(ha, "FAST_MR (IB_WR_FAST_REG_MR) failed" " rc = %d\n", rc); *bad_wr = wr; break; } qp->wqe_wr_id[qp->sq.prod].wqe_size = fwqe1->prev_wqe_size; qp->prev_wqe_size = fwqe1->prev_wqe_size; break; #endif /* #if __FreeBSD_version >= 1102000 */ default: QL_DPRINT12(ha, "Invalid Opcode 0x%x!\n", wr->opcode); rc = -EINVAL; *bad_wr = wr; break; } if (*bad_wr) { /* * restore prod to its position before this WR was processed */ ecore_chain_set_prod(&qp->sq.pbl, le16_to_cpu(qp->sq.db_data.data.value), wqe); /* restore prev_wqe_size */ qp->prev_wqe_size = wqe->prev_wqe_size; status = rc; QL_DPRINT12(ha, "failed *bad_wr = %p\n", *bad_wr); break; /* out of the loop */ } qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id; qlnxr_inc_sw_prod(&qp->sq); qp->sq.db_data.data.value++; wr = wr->next; } /* Trigger doorbell * If there was a failure in the first WR then it will be triggered in * vane. However this is not harmful (as long as the producer value is * unchanged). For performance reasons we avoid checking for this * redundant doorbell. 
*/ wmb(); //writel(qp->sq.db_data.raw, qp->sq.db); reg_addr = (uint32_t)((uint8_t *)qp->sq.db - (uint8_t *)ha->cdev.doorbells); bus_write_4(ha->pci_dbells, reg_addr, qp->sq.db_data.raw); bus_barrier(ha->pci_dbells, 0, 0, BUS_SPACE_BARRIER_READ); mmiowb(); spin_unlock_irqrestore(&qp->q_lock, flags); QL_DPRINT12(ha, "exit[ibqp, wr, bad_wr] = [%p, %p, %p]\n", ibqp, wr, bad_wr); return status; } static u32 qlnxr_srq_elem_left(struct qlnxr_srq_hwq_info *hw_srq) { u32 used; /* Calculate number of elements used based on producer * count and consumer count and subtract it from max * work request supported so that we get elements left. */ used = hw_srq->wr_prod_cnt - hw_srq->wr_cons_cnt; return hw_srq->max_wr - used; } int qlnxr_post_recv(struct ib_qp *ibqp, - struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); struct qlnxr_dev *dev = qp->dev; unsigned long flags; int status = 0; qlnx_host_t *ha; uint32_t reg_addr; ha = dev->ha; if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; QL_DPRINT12(ha, "enter\n"); if (qp->qp_type == IB_QPT_GSI) { QL_DPRINT12(ha, "(qp->qp_type = IB_QPT_GSI)\n"); return qlnxr_gsi_post_recv(ibqp, wr, bad_wr); } if (qp->srq) { QL_DPRINT11(ha, "qp->srq [%p]" " QP is associated with SRQ, cannot post RQ buffers\n", qp->srq); return -EINVAL; } spin_lock_irqsave(&qp->q_lock, flags); if (qp->state == ECORE_ROCE_QP_STATE_RESET) { spin_unlock_irqrestore(&qp->q_lock, flags); *bad_wr = wr; QL_DPRINT11(ha, "qp->qp_type = ECORE_ROCE_QP_STATE_RESET\n"); return -EINVAL; } while (wr) { int i; if ((ecore_chain_get_elem_left_u32(&qp->rq.pbl) < QLNXR_MAX_RQE_ELEMENTS_PER_RQE) || (wr->num_sge > qp->rq.max_sges)) { status = -ENOMEM; *bad_wr = wr; break; } for (i = 0; i < wr->num_sge; i++) { u32 flags = 0; struct rdma_rq_sge *rqe = ecore_chain_produce(&qp->rq.pbl); /* first one must include the number of SGE in the list */ if (!i) SET_FIELD(flags, 
RDMA_RQ_SGE_NUM_SGES, wr->num_sge); SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, wr->sg_list[i].lkey); RQ_SGE_SET(rqe, wr->sg_list[i].addr, \ wr->sg_list[i].length, flags); } /* Special case of no sges. FW requires between 1-4 sges... * in this case we need to post 1 sge with length zero. this is * because rdma write with immediate consumes an RQ. */ if (!wr->num_sge) { u32 flags = 0; struct rdma_rq_sge *rqe = ecore_chain_produce(&qp->rq.pbl); /* first one must include the number of SGE in the list */ SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, 0); SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, 1); //RQ_SGE_SET(rqe, 0, 0, flags); rqe->addr.hi = 0; rqe->addr.lo = 0; rqe->length = 0; rqe->flags = cpu_to_le32(flags); i = 1; } qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id; qp->rqe_wr_id[qp->rq.prod].wqe_size = i; qlnxr_inc_sw_prod(&qp->rq); wmb(); qp->rq.db_data.data.value++; // writel(qp->rq.db_data.raw, qp->rq.db); mmiowb(); // if (QLNX_IS_IWARP(dev)) { // writel(qp->rq.iwarp_db2_data.raw, qp->rq.iwarp_db2); // mmiowb(); /* for second doorbell */ // } reg_addr = (uint32_t)((uint8_t *)qp->rq.db - (uint8_t *)ha->cdev.doorbells); bus_write_4(ha->pci_dbells, reg_addr, qp->rq.db_data.raw); bus_barrier(ha->pci_dbells, 0, 0, BUS_SPACE_BARRIER_READ); if (QLNX_IS_IWARP(dev)) { reg_addr = (uint32_t)((uint8_t *)qp->rq.iwarp_db2 - (uint8_t *)ha->cdev.doorbells); bus_write_4(ha->pci_dbells, reg_addr, \ qp->rq.iwarp_db2_data.raw); bus_barrier(ha->pci_dbells, 0, 0, \ BUS_SPACE_BARRIER_READ); } wr = wr->next; } spin_unlock_irqrestore(&qp->q_lock, flags); QL_DPRINT12(ha, "exit status = 0x%x\n", status); return status; } /* In fmr we need to increase the number of fmr completed counter for the fmr * algorithm determining whether we can free a pbl or not. * we need to perform this whether the work request was signaled or not. 
for * this purpose we call this function from the condition that checks if a wr * should be skipped, to make sure we don't miss it ( possibly this fmr * operation was not signalted) */ static inline void qlnxr_chk_if_fmr(struct qlnxr_qp *qp) { #if __FreeBSD_version >= 1102000 if (qp->wqe_wr_id[qp->sq.cons].opcode == IB_WC_REG_MR) qp->wqe_wr_id[qp->sq.cons].mr->info.completed++; #else if (qp->wqe_wr_id[qp->sq.cons].opcode == IB_WC_FAST_REG_MR) qp->wqe_wr_id[qp->sq.cons].frmr->info.completed++; #endif /* #if __FreeBSD_version >= 1102000 */ } static int process_req(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_cq *cq, int num_entries, struct ib_wc *wc, u16 hw_cons, enum ib_wc_status status, int force) { u16 cnt = 0; qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, "enter\n"); while (num_entries && qp->sq.wqe_cons != hw_cons) { if (!qp->wqe_wr_id[qp->sq.cons].signaled && !force) { qlnxr_chk_if_fmr(qp); /* skip WC */ goto next_cqe; } /* fill WC */ wc->status = status; wc->vendor_err = 0; wc->wc_flags = 0; wc->src_qp = qp->id; wc->qp = &qp->ibqp; // common section wc->wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id; wc->opcode = qp->wqe_wr_id[qp->sq.cons].opcode; switch (wc->opcode) { case IB_WC_RDMA_WRITE: wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; QL_DPRINT12(ha, "opcode = IB_WC_RDMA_WRITE bytes = %d\n", qp->wqe_wr_id[qp->sq.cons].bytes_len); break; case IB_WC_COMP_SWAP: case IB_WC_FETCH_ADD: wc->byte_len = 8; break; #if __FreeBSD_version >= 1102000 case IB_WC_REG_MR: qp->wqe_wr_id[qp->sq.cons].mr->info.completed++; break; #else case IB_WC_FAST_REG_MR: qp->wqe_wr_id[qp->sq.cons].frmr->info.completed++; break; #endif /* #if __FreeBSD_version >= 1102000 */ case IB_WC_RDMA_READ: case IB_WC_SEND: QL_DPRINT12(ha, "opcode = 0x%x \n", wc->opcode); break; default: ;//DP_ERR("TBD ERROR"); } num_entries--; wc++; cnt++; next_cqe: while (qp->wqe_wr_id[qp->sq.cons].wqe_size--) ecore_chain_consume(&qp->sq.pbl); qlnxr_inc_sw_cons(&qp->sq); } QL_DPRINT12(ha, "exit cnt = 
0x%x\n", cnt); return cnt; } static int qlnxr_poll_cq_req(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_cq *cq, int num_entries, struct ib_wc *wc, struct rdma_cqe_requester *req) { int cnt = 0; qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, "enter req->status = 0x%x\n", req->status); switch (req->status) { case RDMA_CQE_REQ_STS_OK: cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons, IB_WC_SUCCESS, 0); break; case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR: if (qp->state != ECORE_ROCE_QP_STATE_ERR) cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons, IB_WC_WR_FLUSH_ERR, 1); break; default: /* other errors case */ /* process all WQE before the cosumer */ qp->state = ECORE_ROCE_QP_STATE_ERR; cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons - 1, IB_WC_SUCCESS, 0); wc += cnt; /* if we have extra WC fill it with actual error info */ if (cnt < num_entries) { enum ib_wc_status wc_status; switch (req->status) { case RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR: wc_status = IB_WC_BAD_RESP_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR: wc_status = IB_WC_LOC_LEN_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR: wc_status = IB_WC_LOC_QP_OP_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR: wc_status = IB_WC_LOC_PROT_ERR; break; case RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR: wc_status = IB_WC_MW_BIND_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR: wc_status = IB_WC_REM_INV_REQ_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR: wc_status = IB_WC_REM_ACCESS_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR: wc_status = IB_WC_REM_OP_ERR; break; case RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR: wc_status = IB_WC_RNR_RETRY_EXC_ERR; break; case RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR: wc_status = IB_WC_RETRY_EXC_ERR; break; default: wc_status = IB_WC_GENERAL_ERR; } cnt += process_req(dev, qp, cq, 1, wc, req->sq_cons, wc_status, 1 /* force use of WC */); } } QL_DPRINT12(ha, "exit cnt = %d\n", cnt); return cnt; } static void 
__process_resp_one(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_cq *cq, struct ib_wc *wc, struct rdma_cqe_responder *resp, u64 wr_id) { enum ib_wc_status wc_status = IB_WC_SUCCESS; #if __FreeBSD_version < 1102000 u8 flags; #endif qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, "enter qp = %p resp->status = 0x%x\n", qp, resp->status); wc->opcode = IB_WC_RECV; wc->wc_flags = 0; switch (resp->status) { case RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR: wc_status = IB_WC_LOC_ACCESS_ERR; break; case RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR: wc_status = IB_WC_LOC_LEN_ERR; break; case RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR: wc_status = IB_WC_LOC_QP_OP_ERR; break; case RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR: wc_status = IB_WC_LOC_PROT_ERR; break; case RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR: wc_status = IB_WC_MW_BIND_ERR; break; case RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR: wc_status = IB_WC_REM_INV_RD_REQ_ERR; break; case RDMA_CQE_RESP_STS_OK: #if __FreeBSD_version >= 1102000 if (resp->flags & QLNXR_RESP_IMM) { wc->ex.imm_data = le32_to_cpu(resp->imm_data_or_inv_r_Key); wc->wc_flags |= IB_WC_WITH_IMM; if (resp->flags & QLNXR_RESP_RDMA) wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; if (resp->flags & QLNXR_RESP_INV) { QL_DPRINT11(ha, "Invalid flags QLNXR_RESP_INV [0x%x]" "qp = %p qp->id = 0x%x cq = %p" " cq->icid = 0x%x\n", resp->flags, qp, qp->id, cq, cq->icid ); } } else if (resp->flags & QLNXR_RESP_INV) { wc->ex.imm_data = le32_to_cpu(resp->imm_data_or_inv_r_Key); wc->wc_flags |= IB_WC_WITH_INVALIDATE; if (resp->flags & QLNXR_RESP_RDMA) { QL_DPRINT11(ha, "Invalid flags QLNXR_RESP_RDMA [0x%x]" "qp = %p qp->id = 0x%x cq = %p" " cq->icid = 0x%x\n", resp->flags, qp, qp->id, cq, cq->icid ); } } else if (resp->flags & QLNXR_RESP_RDMA) { QL_DPRINT11(ha, "Invalid flags QLNXR_RESP_RDMA [0x%x]" "qp = %p qp->id = 0x%x cq = %p cq->icid = 0x%x\n", resp->flags, qp, qp->id, cq, cq->icid ); } #else wc_status = IB_WC_SUCCESS; wc->byte_len = le32_to_cpu(resp->length); flags = resp->flags & 
QLNXR_RESP_RDMA_IMM; switch (flags) { case QLNXR_RESP_RDMA_IMM: /* update opcode */ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; /* fall to set imm data */ case QLNXR_RESP_IMM: wc->ex.imm_data = le32_to_cpu(resp->imm_data_or_inv_r_Key); wc->wc_flags |= IB_WC_WITH_IMM; break; case QLNXR_RESP_RDMA: QL_DPRINT11(ha, "Invalid flags QLNXR_RESP_RDMA [0x%x]" "qp = %p qp->id = 0x%x cq = %p cq->icid = 0x%x\n", resp->flags, qp, qp->id, cq, cq->icid ); break; default: /* valid configuration, but nothing todo here */ ; } #endif /* #if __FreeBSD_version >= 1102000 */ break; default: wc_status = IB_WC_GENERAL_ERR; } /* fill WC */ wc->status = wc_status; wc->vendor_err = 0; wc->src_qp = qp->id; wc->qp = &qp->ibqp; wc->wr_id = wr_id; QL_DPRINT12(ha, "exit status = 0x%x\n", wc_status); return; } static int process_resp_one_srq(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_cq *cq, struct ib_wc *wc, struct rdma_cqe_responder *resp) { struct qlnxr_srq *srq = qp->srq; u64 wr_id; qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, "enter\n"); wr_id = HILO_U64(resp->srq_wr_id.hi, resp->srq_wr_id.lo); if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { wc->status = IB_WC_WR_FLUSH_ERR; wc->vendor_err = 0; wc->wr_id = wr_id; wc->byte_len = 0; wc->src_qp = qp->id; wc->qp = &qp->ibqp; wc->wr_id = wr_id; } else { __process_resp_one(dev, qp, cq, wc, resp, wr_id); } /* PBL is maintained in case of WR granularity. 
* So increment WR consumer after consuming WR */ srq->hw_srq.wr_cons_cnt++; QL_DPRINT12(ha, "exit\n"); return 1; } static int process_resp_one(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_cq *cq, struct ib_wc *wc, struct rdma_cqe_responder *resp) { qlnx_host_t *ha = dev->ha; u64 wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; QL_DPRINT12(ha, "enter\n"); __process_resp_one(dev, qp, cq, wc, resp, wr_id); while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) ecore_chain_consume(&qp->rq.pbl); qlnxr_inc_sw_cons(&qp->rq); QL_DPRINT12(ha, "exit\n"); return 1; } static int process_resp_flush(struct qlnxr_qp *qp, int num_entries, struct ib_wc *wc, u16 hw_cons) { u16 cnt = 0; qlnx_host_t *ha = qp->dev->ha; QL_DPRINT12(ha, "enter\n"); while (num_entries && qp->rq.wqe_cons != hw_cons) { /* fill WC */ wc->status = IB_WC_WR_FLUSH_ERR; wc->vendor_err = 0; wc->wc_flags = 0; wc->src_qp = qp->id; wc->byte_len = 0; wc->wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; wc->qp = &qp->ibqp; num_entries--; wc++; cnt++; while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) ecore_chain_consume(&qp->rq.pbl); qlnxr_inc_sw_cons(&qp->rq); } QL_DPRINT12(ha, "exit cnt = 0x%x\n", cnt); return cnt; } static void try_consume_resp_cqe(struct qlnxr_cq *cq, struct qlnxr_qp *qp, struct rdma_cqe_responder *resp, int *update) { if (le16_to_cpu(resp->rq_cons) == qp->rq.wqe_cons) { consume_cqe(cq); *update |= 1; } } static int qlnxr_poll_cq_resp_srq(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_cq *cq, int num_entries, struct ib_wc *wc, struct rdma_cqe_responder *resp, int *update) { int cnt; qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, "enter\n"); cnt = process_resp_one_srq(dev, qp, cq, wc, resp); consume_cqe(cq); *update |= 1; QL_DPRINT12(ha, "exit cnt = 0x%x\n", cnt); return cnt; } static int qlnxr_poll_cq_resp(struct qlnxr_dev *dev, struct qlnxr_qp *qp, struct qlnxr_cq *cq, int num_entries, struct ib_wc *wc, struct rdma_cqe_responder *resp, int *update) { int cnt; qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, 
"enter\n"); if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { cnt = process_resp_flush(qp, num_entries, wc, resp->rq_cons); try_consume_resp_cqe(cq, qp, resp, update); } else { cnt = process_resp_one(dev, qp, cq, wc, resp); consume_cqe(cq); *update |= 1; } QL_DPRINT12(ha, "exit cnt = 0x%x\n", cnt); return cnt; } static void try_consume_req_cqe(struct qlnxr_cq *cq, struct qlnxr_qp *qp, struct rdma_cqe_requester *req, int *update) { if (le16_to_cpu(req->sq_cons) == qp->sq.wqe_cons) { consume_cqe(cq); *update |= 1; } } static void doorbell_cq(struct qlnxr_dev *dev, struct qlnxr_cq *cq, u32 cons, u8 flags) { uint64_t reg_addr; qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, "enter\n"); wmb(); cq->db.data.agg_flags = flags; cq->db.data.value = cpu_to_le32(cons); reg_addr = (uint64_t)((uint8_t *)cq->db_addr - (uint8_t *)(ha->cdev.doorbells)); bus_write_8(ha->pci_dbells, reg_addr, cq->db.raw); bus_barrier(ha->pci_dbells, 0, 0, BUS_SPACE_BARRIER_READ); QL_DPRINT12(ha, "exit\n"); return; //#ifdef __LP64__ // writeq(cq->db.raw, cq->db_addr); //#else /* Note that since the FW allows 64 bit write only, in 32bit systems * the value of db_addr must be low enough. This is currently not * enforced. 
*/ // writel(cq->db.raw & 0xffffffff, cq->db_addr); // mmiowb(); //#endif } static int is_valid_cqe(struct qlnxr_cq *cq, union rdma_cqe *cqe) { struct rdma_cqe_requester *resp_cqe = &cqe->req; return (resp_cqe->flags & RDMA_RESIZE_CQ_RAMROD_DATA_TOGGLE_BIT_MASK) == cq->pbl_toggle; } int qlnxr_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct qlnxr_cq *cq = get_qlnxr_cq(ibcq); struct qlnxr_dev *dev = get_qlnxr_dev((ibcq->device)); int done = 0; union rdma_cqe *cqe = cq->latest_cqe; int update = 0; u32 old_cons, new_cons; unsigned long flags; qlnx_host_t *ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; if (cq->destroyed) { QL_DPRINT11(ha, "called after destroy for cq %p (icid=%d)\n", cq, cq->icid); return 0; } if (cq->cq_type == QLNXR_CQ_TYPE_GSI) return qlnxr_gsi_poll_cq(ibcq, num_entries, wc); spin_lock_irqsave(&cq->cq_lock, flags); old_cons = ecore_chain_get_cons_idx_u32(&cq->pbl); while (num_entries && is_valid_cqe(cq, cqe)) { int cnt = 0; struct qlnxr_qp *qp; struct rdma_cqe_requester *resp_cqe; enum rdma_cqe_type cqe_type; /* prevent speculative reads of any field of CQE */ rmb(); resp_cqe = &cqe->req; qp = (struct qlnxr_qp *)(uintptr_t)HILO_U64(resp_cqe->qp_handle.hi, resp_cqe->qp_handle.lo); if (!qp) { QL_DPRINT11(ha, "qp = NULL\n"); break; } wc->qp = &qp->ibqp; cqe_type = GET_FIELD(resp_cqe->flags, RDMA_CQE_REQUESTER_TYPE); switch (cqe_type) { case RDMA_CQE_TYPE_REQUESTER: cnt = qlnxr_poll_cq_req(dev, qp, cq, num_entries, wc, &cqe->req); try_consume_req_cqe(cq, qp, &cqe->req, &update); break; case RDMA_CQE_TYPE_RESPONDER_RQ: cnt = qlnxr_poll_cq_resp(dev, qp, cq, num_entries, wc, &cqe->resp, &update); break; case RDMA_CQE_TYPE_RESPONDER_SRQ: cnt = qlnxr_poll_cq_resp_srq(dev, qp, cq, num_entries, wc, &cqe->resp, &update); break; case RDMA_CQE_TYPE_INVALID: default: QL_DPRINT11(ha, "cqe type [0x%x] invalid\n", cqe_type); break; } num_entries -= cnt; wc += cnt; done += cnt; cqe = 
cq->latest_cqe; } new_cons = ecore_chain_get_cons_idx_u32(&cq->pbl); cq->cq_cons += new_cons - old_cons; if (update) { /* doorbell notifies abount latest VALID entry, * but chain already point to the next INVALID one */ doorbell_cq(dev, cq, cq->cq_cons - 1, cq->arm_flags); QL_DPRINT12(ha, "cq = %p cons = 0x%x " "arm_flags = 0x%x db.icid = 0x%x\n", cq, (cq->cq_cons - 1), cq->arm_flags, cq->db.data.icid); } spin_unlock_irqrestore(&cq->cq_lock, flags); QL_DPRINT12(ha, "exit\n"); return done; } int qlnxr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct qlnxr_cq *cq = get_qlnxr_cq(ibcq); unsigned long sflags; struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev((ibcq->device)); ha = dev->ha; QL_DPRINT12(ha, "enter ibcq = %p flags = 0x%x " "cp = %p cons = 0x%x cq_type = 0x%x\n", ibcq, flags, cq, cq->cq_cons, cq->cq_type); if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; if (cq->destroyed) { QL_DPRINT11(ha, "cq was already destroyed cq = %p icid=%d\n", cq, cq->icid); return -EINVAL; } if (cq->cq_type == QLNXR_CQ_TYPE_GSI) { return 0; } spin_lock_irqsave(&cq->cq_lock, sflags); cq->arm_flags = 0; if (flags & IB_CQ_SOLICITED) { cq->arm_flags |= DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD; } if (flags & IB_CQ_NEXT_COMP) { cq->arm_flags |= DQ_UCM_ROCE_CQ_ARM_CF_CMD; } doorbell_cq(dev, cq, (cq->cq_cons - 1), cq->arm_flags); spin_unlock_irqrestore(&cq->cq_lock, sflags); QL_DPRINT12(ha, "exit ibcq = %p flags = 0x%x\n", ibcq, flags); return 0; } static struct qlnxr_mr * __qlnxr_alloc_mr(struct ib_pd *ibpd, int max_page_list_len) { struct qlnxr_pd *pd = get_qlnxr_pd(ibpd); struct qlnxr_dev *dev = get_qlnxr_dev((ibpd->device)); struct qlnxr_mr *mr; int rc = -ENOMEM; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter ibpd = %p pd = %p " " pd_id = %d max_page_list_len = %d\n", ibpd, pd, pd->pd_id, max_page_list_len); mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { QL_DPRINT11(ha, "kzalloc(mr) failed\n"); return ERR_PTR(rc); } mr->dev = dev; mr->type = 
QLNXR_MR_FRMR; rc = qlnxr_init_mr_info(dev, &mr->info, max_page_list_len, 1 /* allow dual layer pbl */); if (rc) { QL_DPRINT11(ha, "qlnxr_init_mr_info failed\n"); goto err0; } rc = ecore_rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid); if (rc) { QL_DPRINT11(ha, "ecore_rdma_alloc_tid failed\n"); goto err0; } /* index only, 18 bit long, lkey = itid << 8 | key */ mr->hw_mr.tid_type = ECORE_RDMA_TID_FMR; mr->hw_mr.key = 0; mr->hw_mr.pd = pd->pd_id; mr->hw_mr.local_read = 1; mr->hw_mr.local_write = 0; mr->hw_mr.remote_read = 0; mr->hw_mr.remote_write = 0; mr->hw_mr.remote_atomic = 0; mr->hw_mr.mw_bind = false; /* TBD MW BIND */ mr->hw_mr.pbl_ptr = 0; /* Will be supplied during post */ mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered; mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size); mr->hw_mr.fbo = 0; mr->hw_mr.length = 0; mr->hw_mr.vaddr = 0; mr->hw_mr.zbva = false; /* TBD figure when this should be true */ mr->hw_mr.phy_mr = true; /* Fast MR - True, Regular Register False */ mr->hw_mr.dma_mr = false; rc = ecore_rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); if (rc) { QL_DPRINT11(ha, "ecore_rdma_register_tid failed\n"); goto err1; } mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; mr->ibmr.rkey = mr->ibmr.lkey; QL_DPRINT12(ha, "exit mr = %p mr->ibmr.lkey = 0x%x\n", mr, mr->ibmr.lkey); return mr; err1: ecore_rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); err0: kfree(mr); QL_DPRINT12(ha, "exit\n"); return ERR_PTR(rc); } #if __FreeBSD_version >= 1102000 struct ib_mr * qlnxr_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg) { struct qlnxr_dev *dev; struct qlnxr_mr *mr; qlnx_host_t *ha; dev = get_qlnxr_dev(ibpd->device); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (mr_type != IB_MR_TYPE_MEM_REG) return ERR_PTR(-EINVAL); mr = __qlnxr_alloc_mr(ibpd, max_num_sg); if (IS_ERR(mr)) return ERR_PTR(-EINVAL); QL_DPRINT12(ha, "exit mr = %p &mr->ibmr = %p\n", mr, &mr->ibmr); return &mr->ibmr; } static int qlnxr_set_page(struct ib_mr *ibmr, 
u64 addr) { struct qlnxr_mr *mr = get_qlnxr_mr(ibmr); struct qlnxr_pbl *pbl_table; struct regpair *pbe; struct qlnxr_dev *dev; qlnx_host_t *ha; u32 pbes_in_page; dev = mr->dev; ha = dev->ha; if (unlikely(mr->npages == mr->info.pbl_info.num_pbes)) { QL_DPRINT12(ha, "fails mr->npages %d\n", mr->npages); return -ENOMEM; } QL_DPRINT12(ha, "mr->npages %d addr = %p enter\n", mr->npages, ((void *)addr)); pbes_in_page = mr->info.pbl_info.pbl_size / sizeof(u64); pbl_table = mr->info.pbl_table + (mr->npages / pbes_in_page); pbe = (struct regpair *)pbl_table->va; pbe += mr->npages % pbes_in_page; pbe->lo = cpu_to_le32((u32)addr); pbe->hi = cpu_to_le32((u32)upper_32_bits(addr)); mr->npages++; QL_DPRINT12(ha, "mr->npages %d addr = %p exit \n", mr->npages, ((void *)addr)); return 0; } int qlnxr_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { int ret; struct qlnxr_mr *mr = get_qlnxr_mr(ibmr); qlnx_host_t *ha; if (mr == NULL) return (-1); if (mr->dev == NULL) return (-1); ha = mr->dev->ha; QL_DPRINT12(ha, "enter\n"); mr->npages = 0; qlnx_handle_completed_mrs(mr->dev, &mr->info); ret = ib_sg_to_pages(ibmr, sg, sg_nents, NULL, qlnxr_set_page); QL_DPRINT12(ha, "exit ret = %d\n", ret); return (ret); } #else struct ib_mr * qlnxr_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len) { struct qlnxr_dev *dev; struct qlnxr_mr *mr; qlnx_host_t *ha; struct ib_mr *ibmr = NULL; dev = get_qlnxr_dev((ibpd->device)); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); mr = __qlnxr_alloc_mr(ibpd, max_page_list_len); if (IS_ERR(mr)) { ibmr = ERR_PTR(-EINVAL); } else { ibmr = &mr->ibmr; } QL_DPRINT12(ha, "exit %p\n", ibmr); return (ibmr); } void qlnxr_free_frmr_page_list(struct ib_fast_reg_page_list *page_list) { struct qlnxr_fast_reg_page_list *frmr_list; frmr_list = get_qlnxr_frmr_list(page_list); free_mr_info(frmr_list->dev, &frmr_list->info); kfree(frmr_list->ibfrpl.page_list); kfree(frmr_list); return; } struct ib_fast_reg_page_list * 
qlnxr_alloc_frmr_page_list(struct ib_device *ibdev, int page_list_len) { struct qlnxr_fast_reg_page_list *frmr_list = NULL; struct qlnxr_dev *dev; int size = page_list_len * sizeof(u64); int rc = -ENOMEM; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); frmr_list = kzalloc(sizeof(*frmr_list), GFP_KERNEL); if (!frmr_list) { QL_DPRINT11(ha, "kzalloc(frmr_list) failed\n"); goto err; } frmr_list->dev = dev; frmr_list->ibfrpl.page_list = kzalloc(size, GFP_KERNEL); if (!frmr_list->ibfrpl.page_list) { QL_DPRINT11(ha, "frmr_list->ibfrpl.page_list = NULL failed\n"); goto err0; } rc = qlnxr_init_mr_info(dev, &frmr_list->info, page_list_len, 1 /* allow dual layer pbl */); if (rc) goto err1; QL_DPRINT12(ha, "exit %p\n", &frmr_list->ibfrpl); return &frmr_list->ibfrpl; err1: kfree(frmr_list->ibfrpl.page_list); err0: kfree(frmr_list); err: QL_DPRINT12(ha, "exit with error\n"); return ERR_PTR(rc); } static int qlnxr_validate_phys_buf_list(qlnx_host_t *ha, struct ib_phys_buf *buf_list, int buf_cnt, uint64_t *total_size) { u64 size = 0; *total_size = 0; if (!buf_cnt || buf_list == NULL) { QL_DPRINT11(ha, "failed buf_list = %p buf_cnt = %d\n", buf_list, buf_cnt); return (-1); } size = buf_list->size; if (!size) { QL_DPRINT11(ha, "failed buf_list = %p buf_cnt = %d" " buf_list->size = 0\n", buf_list, buf_cnt); return (-1); } while (buf_cnt) { *total_size += buf_list->size; if (buf_list->size != size) { QL_DPRINT11(ha, "failed buf_list = %p buf_cnt = %d" " all buffers should have same size\n", buf_list, buf_cnt); return (-1); } buf_list++; buf_cnt--; } return (0); } static size_t qlnxr_get_num_pages(qlnx_host_t *ha, struct ib_phys_buf *buf_list, int buf_cnt) { int i; size_t num_pages = 0; u64 size; for (i = 0; i < buf_cnt; i++) { size = 0; while (size < buf_list->size) { size += PAGE_SIZE; num_pages++; } buf_list++; } return (num_pages); } static void qlnxr_populate_phys_mem_pbls(struct qlnxr_dev *dev, struct ib_phys_buf *buf_list, int buf_cnt, 
struct qlnxr_pbl *pbl, struct qlnxr_pbl_info *pbl_info) { struct regpair *pbe; struct qlnxr_pbl *pbl_tbl; int pg_cnt, pages, pbe_cnt, total_num_pbes = 0; qlnx_host_t *ha; int i; u64 pbe_addr; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (!pbl_info) { QL_DPRINT11(ha, "PBL_INFO not initialized\n"); return; } if (!pbl_info->num_pbes) { QL_DPRINT11(ha, "pbl_info->num_pbes == 0\n"); return; } /* If we have a two layered pbl, the first pbl points to the rest * of the pbls and the first entry lays on the second pbl in the table */ if (pbl_info->two_layered) pbl_tbl = &pbl[1]; else pbl_tbl = pbl; pbe = (struct regpair *)pbl_tbl->va; if (!pbe) { QL_DPRINT12(ha, "pbe is NULL\n"); return; } pbe_cnt = 0; for (i = 0; i < buf_cnt; i++) { pages = buf_list->size >> PAGE_SHIFT; for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { /* store the page address in pbe */ pbe_addr = buf_list->addr + (PAGE_SIZE * pg_cnt); pbe->lo = cpu_to_le32((u32)pbe_addr); pbe->hi = cpu_to_le32(((u32)(pbe_addr >> 32))); QL_DPRINT12(ha, "Populate pbl table:" " pbe->addr=0x%x:0x%x " " pbe_cnt = %d total_num_pbes=%d" " pbe=%p\n", pbe->lo, pbe->hi, pbe_cnt, total_num_pbes, pbe); pbe_cnt ++; total_num_pbes ++; pbe++; if (total_num_pbes == pbl_info->num_pbes) return; /* if the given pbl is full storing the pbes, * move to next pbl. 
*/ if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) { pbl_tbl++; pbe = (struct regpair *)pbl_tbl->va; pbe_cnt = 0; } } buf_list++; } QL_DPRINT12(ha, "exit\n"); return; } struct ib_mr * qlnxr_reg_kernel_mr(struct ib_pd *ibpd, struct ib_phys_buf *buf_list, int buf_cnt, int acc, u64 *iova_start) { int rc = -ENOMEM; struct qlnxr_dev *dev = get_qlnxr_dev((ibpd->device)); struct qlnxr_mr *mr; struct qlnxr_pd *pd; qlnx_host_t *ha; size_t num_pages = 0; uint64_t length; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); pd = get_qlnxr_pd(ibpd); QL_DPRINT12(ha, "pd = %d buf_list = %p, buf_cnt = %d," " iova_start = %p, acc = %d\n", pd->pd_id, buf_list, buf_cnt, iova_start, acc); //if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) { // QL_DPRINT11(ha, "(acc & IB_ACCESS_REMOTE_WRITE &&" // " !(acc & IB_ACCESS_LOCAL_WRITE))\n"); // return ERR_PTR(-EINVAL); //} mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { QL_DPRINT11(ha, "kzalloc(mr) failed\n"); return ERR_PTR(rc); } mr->type = QLNXR_MR_KERNEL; mr->iova_start = iova_start; rc = qlnxr_validate_phys_buf_list(ha, buf_list, buf_cnt, &length); if (rc) goto err0; num_pages = qlnxr_get_num_pages(ha, buf_list, buf_cnt); if (!num_pages) goto err0; rc = qlnxr_init_mr_info(dev, &mr->info, num_pages, 1); if (rc) { QL_DPRINT11(ha, "qlnxr_init_mr_info failed [%d]\n", rc); goto err1; } qlnxr_populate_phys_mem_pbls(dev, buf_list, buf_cnt, mr->info.pbl_table, &mr->info.pbl_info); rc = ecore_rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid); if (rc) { QL_DPRINT11(ha, "roce alloc tid returned an error %d\n", rc); goto err1; } /* index only, 18 bit long, lkey = itid << 8 | key */ mr->hw_mr.tid_type = ECORE_RDMA_TID_REGISTERED_MR; mr->hw_mr.key = 0; mr->hw_mr.pd = pd->pd_id; mr->hw_mr.local_read = 1; mr->hw_mr.local_write = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; mr->hw_mr.remote_read = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; mr->hw_mr.remote_write = (acc & IB_ACCESS_REMOTE_WRITE) ? 
1 : 0; mr->hw_mr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; mr->hw_mr.mw_bind = false; /* TBD MW BIND */ mr->hw_mr.pbl_ptr = mr->info.pbl_table[0].pa; mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered; mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size); mr->hw_mr.page_size_log = ilog2(PAGE_SIZE); /* for the MR pages */ mr->hw_mr.fbo = 0; mr->hw_mr.length = length; mr->hw_mr.vaddr = (uint64_t)iova_start; mr->hw_mr.zbva = false; /* TBD figure when this should be true */ mr->hw_mr.phy_mr = false; /* Fast MR - True, Regular Register False */ mr->hw_mr.dma_mr = false; rc = ecore_rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); if (rc) { QL_DPRINT11(ha, "roce register tid returned an error %d\n", rc); goto err2; } mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; if (mr->hw_mr.remote_write || mr->hw_mr.remote_read || mr->hw_mr.remote_atomic) mr->ibmr.rkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; QL_DPRINT12(ha, "lkey: %x\n", mr->ibmr.lkey); return (&mr->ibmr); err2: ecore_rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); err1: qlnxr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); err0: kfree(mr); QL_DPRINT12(ha, "exit [%d]\n", rc); return (ERR_PTR(rc)); } #endif /* #if __FreeBSD_version >= 1102000 */ struct ib_ah * #if __FreeBSD_version >= 1102000 qlnxr_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr, struct ib_udata *udata) #else qlnxr_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) #endif /* #if __FreeBSD_version >= 1102000 */ { struct qlnxr_dev *dev; qlnx_host_t *ha; struct qlnxr_ah *ah; dev = get_qlnxr_dev((ibpd->device)); ha = dev->ha; QL_DPRINT12(ha, "in create_ah\n"); ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) { QL_DPRINT12(ha, "no address handle can be allocated\n"); return ERR_PTR(-ENOMEM); } ah->attr = *attr; return &ah->ibah; } int qlnxr_destroy_ah(struct ib_ah *ibah) { struct qlnxr_dev *dev; qlnx_host_t *ha; struct qlnxr_ah *ah = get_qlnxr_ah(ibah); dev = get_qlnxr_dev((ibah->device)); ha = dev->ha; 
QL_DPRINT12(ha, "in destroy_ah\n"); kfree(ah); return 0; } int qlnxr_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev((ibah->device)); ha = dev->ha; QL_DPRINT12(ha, "Query AH not supported\n"); return -EINVAL; } int qlnxr_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev((ibah->device)); ha = dev->ha; QL_DPRINT12(ha, "Modify AH not supported\n"); return -ENOSYS; } #if __FreeBSD_version >= 1102000 int qlnxr_process_mad(struct ib_device *ibdev, int process_mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *mad_hdr, size_t in_mad_size, struct ib_mad_hdr *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index) #else int qlnxr_process_mad(struct ib_device *ibdev, int process_mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) #endif /* #if __FreeBSD_version >= 1102000 */ { struct qlnxr_dev *dev; qlnx_host_t *ha; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "process mad not supported\n"); return -ENOSYS; // QL_DPRINT12(ha, "qlnxr_process_mad in_mad %x %x %x %x %x %x %x %x\n", // in_mad->mad_hdr.attr_id, in_mad->mad_hdr.base_version, // in_mad->mad_hdr.attr_mod, in_mad->mad_hdr.class_specific, // in_mad->mad_hdr.class_version, in_mad->mad_hdr.method, // in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.status); // return IB_MAD_RESULT_SUCCESS; } #if __FreeBSD_version >= 1102000 int qlnxr_get_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct qlnxr_dev *dev; qlnx_host_t *ha; struct ib_port_attr attr; int err; dev = get_qlnxr_dev(ibdev); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); err = qlnxr_query_port(ibdev, port_num, &attr); if (err) return err; if (QLNX_IS_IWARP(dev)) { immutable->pkey_tbl_len = 1; immutable->gid_tbl_len = 1; immutable->core_cap_flags = 
RDMA_CORE_PORT_IWARP; immutable->max_mad_size = 0; } else { immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; immutable->max_mad_size = IB_MGMT_MAD_SIZE; } QL_DPRINT12(ha, "exit\n"); return 0; } #endif /* #if __FreeBSD_version > 1102000 */ /***** iWARP related functions *************/ static void qlnxr_iw_mpa_request(void *context, struct ecore_iwarp_cm_event_params *params) { struct qlnxr_iw_listener *listener = (struct qlnxr_iw_listener *)context; struct qlnxr_dev *dev = listener->dev; struct qlnxr_iw_ep *ep; struct iw_cm_event event; struct sockaddr_in *laddr; struct sockaddr_in *raddr; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (params->cm_info->ip_version != ECORE_TCP_IPV4) { QL_DPRINT11(ha, "only IPv4 supported [0x%x]\n", params->cm_info->ip_version); return; } ep = kzalloc(sizeof(*ep), GFP_ATOMIC); if (!ep) { QL_DPRINT11(ha, "kzalloc{ep) failed\n"); return; } ep->dev = dev; ep->ecore_context = params->ep_context; memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REQUEST; event.status = params->status; laddr = (struct sockaddr_in *)&event.local_addr; raddr = (struct sockaddr_in *)&event.remote_addr; laddr->sin_family = AF_INET; raddr->sin_family = AF_INET; laddr->sin_port = htons(params->cm_info->local_port); raddr->sin_port = htons(params->cm_info->remote_port); laddr->sin_addr.s_addr = htonl(params->cm_info->local_ip[0]); raddr->sin_addr.s_addr = htonl(params->cm_info->remote_ip[0]); event.provider_data = (void *)ep; event.private_data = (void *)params->cm_info->private_data; event.private_data_len = (u8)params->cm_info->private_data_len; #if __FreeBSD_version >= 1100000 event.ord = params->cm_info->ord; event.ird = params->cm_info->ird; #endif /* #if __FreeBSD_version >= 1100000 */ listener->cm_id->event_handler(listener->cm_id, &event); QL_DPRINT12(ha, "exit\n"); return; } static void qlnxr_iw_issue_event(void *context, struct 
ecore_iwarp_cm_event_params *params, enum iw_cm_event_type event_type, char *str) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; struct qlnxr_dev *dev = ep->dev; struct iw_cm_event event; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); memset(&event, 0, sizeof(event)); event.status = params->status; event.event = event_type; if (params->cm_info != NULL) { #if __FreeBSD_version >= 1100000 event.ird = params->cm_info->ird; event.ord = params->cm_info->ord; QL_DPRINT12(ha, "ord=[%d] \n", event.ord); QL_DPRINT12(ha, "ird=[%d] \n", event.ird); #endif /* #if __FreeBSD_version >= 1100000 */ event.private_data_len = params->cm_info->private_data_len; event.private_data = (void *)params->cm_info->private_data; QL_DPRINT12(ha, "private_data_len=[%d] \n", event.private_data_len); } QL_DPRINT12(ha, "event=[%d] %s\n", event.event, str); QL_DPRINT12(ha, "status=[%d] \n", event.status); if (ep) { if (ep->cm_id) ep->cm_id->event_handler(ep->cm_id, &event); else QL_DPRINT11(ha, "ep->cm_id == NULL \n"); } else { QL_DPRINT11(ha, "ep == NULL \n"); } QL_DPRINT12(ha, "exit\n"); return; } static void qlnxr_iw_close_event(void *context, struct ecore_iwarp_cm_event_params *params) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; struct qlnxr_dev *dev = ep->dev; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (ep->cm_id) { qlnxr_iw_issue_event(context, params, IW_CM_EVENT_CLOSE, "IW_CM_EVENT_EVENT_CLOSE"); ep->cm_id->rem_ref(ep->cm_id); ep->cm_id = NULL; } QL_DPRINT12(ha, "exit\n"); return; } #if __FreeBSD_version >= 1102000 static void qlnxr_iw_passive_complete(void *context, struct ecore_iwarp_cm_event_params *params) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; struct qlnxr_dev *dev = ep->dev; qlnx_host_t *ha; ha = dev->ha; /* We will only reach the following state if MPA_REJECT was called on * passive. In this case there will be no associated QP. 
*/ if ((params->status == -ECONNREFUSED) && (ep->qp == NULL)) { QL_DPRINT11(ha, "PASSIVE connection refused releasing ep...\n"); kfree(ep); return; } /* We always issue an established event, however, ofed does not look * at event code for established. So if there was a failure, we follow * with close... */ qlnxr_iw_issue_event(context, params, IW_CM_EVENT_ESTABLISHED, "IW_CM_EVENT_ESTABLISHED"); if (params->status < 0) { qlnxr_iw_close_event(context, params); } return; } struct qlnxr_discon_work { struct work_struct work; struct qlnxr_iw_ep *ep; enum ecore_iwarp_event_type event; int status; }; static void qlnxr_iw_disconnect_worker(struct work_struct *work) { struct qlnxr_discon_work *dwork = container_of(work, struct qlnxr_discon_work, work); struct ecore_rdma_modify_qp_in_params qp_params = { 0 }; struct qlnxr_iw_ep *ep = dwork->ep; struct qlnxr_dev *dev = ep->dev; struct qlnxr_qp *qp = ep->qp; struct iw_cm_event event; if (qp->destroyed) { kfree(dwork); qlnxr_iw_qp_rem_ref(&qp->ibqp); return; } memset(&event, 0, sizeof(event)); event.status = dwork->status; event.event = IW_CM_EVENT_DISCONNECT; /* Success means graceful disconnect was requested. modifying * to SQD is translated to graceful disconnect. 
O/w reset is sent */ if (dwork->status) qp_params.new_state = ECORE_ROCE_QP_STATE_ERR; else qp_params.new_state = ECORE_ROCE_QP_STATE_SQD; kfree(dwork); if (ep->cm_id) ep->cm_id->event_handler(ep->cm_id, &event); SET_FIELD(qp_params.modify_flags, ECORE_RDMA_MODIFY_QP_VALID_NEW_STATE, 1); ecore_rdma_modify_qp(dev->rdma_ctx, qp->ecore_qp, &qp_params); qlnxr_iw_qp_rem_ref(&qp->ibqp); return; } void qlnxr_iw_disconnect_event(void *context, struct ecore_iwarp_cm_event_params *params) { struct qlnxr_discon_work *work; struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; struct qlnxr_dev *dev = ep->dev; struct qlnxr_qp *qp = ep->qp; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; qlnxr_iw_qp_add_ref(&qp->ibqp); work->ep = ep; work->event = params->event; work->status = params->status; INIT_WORK(&work->work, qlnxr_iw_disconnect_worker); queue_work(dev->iwarp_wq, &work->work); return; } #endif /* #if __FreeBSD_version >= 1102000 */ static int qlnxr_iw_mpa_reply(void *context, struct ecore_iwarp_cm_event_params *params) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; struct qlnxr_dev *dev = ep->dev; struct ecore_iwarp_send_rtr_in rtr_in; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; bzero(&rtr_in, sizeof(struct ecore_iwarp_send_rtr_in)); rtr_in.ep_context = params->ep_context; rc = ecore_iwarp_send_rtr(dev->rdma_ctx, &rtr_in); QL_DPRINT12(ha, "exit rc = %d\n", rc); return rc; } void qlnxr_iw_qp_event(void *context, struct ecore_iwarp_cm_event_params *params, enum ib_event_type ib_event, char *str) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; struct qlnxr_dev *dev = ep->dev; struct ib_qp *ibqp = &(ep->qp->ibqp); struct ib_event event; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "[context, event, event_handler] = [%p, 0x%x, %s, %p] enter\n", context, params->event, str, ibqp->event_handler); if (ibqp->event_handler) { event.event = ib_event; 
event.device = ibqp->device; event.element.qp = ibqp; ibqp->event_handler(&event, ibqp->qp_context); } return; } int qlnxr_iw_event_handler(void *context, struct ecore_iwarp_cm_event_params *params) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; struct qlnxr_dev *dev = ep->dev; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "[context, event] = [%p, 0x%x] " "enter\n", context, params->event); switch (params->event) { /* Passive side request received */ case ECORE_IWARP_EVENT_MPA_REQUEST: qlnxr_iw_mpa_request(context, params); break; case ECORE_IWARP_EVENT_ACTIVE_MPA_REPLY: qlnxr_iw_mpa_reply(context, params); break; /* Passive side established ( ack on mpa response ) */ case ECORE_IWARP_EVENT_PASSIVE_COMPLETE: #if __FreeBSD_version >= 1102000 ep->during_connect = 0; qlnxr_iw_passive_complete(context, params); #else qlnxr_iw_issue_event(context, params, IW_CM_EVENT_ESTABLISHED, "IW_CM_EVENT_ESTABLISHED"); #endif /* #if __FreeBSD_version >= 1102000 */ break; /* Active side reply received */ case ECORE_IWARP_EVENT_ACTIVE_COMPLETE: ep->during_connect = 0; qlnxr_iw_issue_event(context, params, IW_CM_EVENT_CONNECT_REPLY, "IW_CM_EVENT_CONNECT_REPLY"); if (params->status < 0) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)context; ep->cm_id->rem_ref(ep->cm_id); ep->cm_id = NULL; } break; case ECORE_IWARP_EVENT_DISCONNECT: #if __FreeBSD_version >= 1102000 qlnxr_iw_disconnect_event(context, params); #else qlnxr_iw_issue_event(context, params, IW_CM_EVENT_DISCONNECT, "IW_CM_EVENT_DISCONNECT"); qlnxr_iw_close_event(context, params); #endif /* #if __FreeBSD_version >= 1102000 */ break; case ECORE_IWARP_EVENT_CLOSE: ep->during_connect = 0; qlnxr_iw_close_event(context, params); break; case ECORE_IWARP_EVENT_RQ_EMPTY: qlnxr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, "IWARP_EVENT_RQ_EMPTY"); break; case ECORE_IWARP_EVENT_IRQ_FULL: qlnxr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, "IWARP_EVENT_IRQ_FULL"); break; case ECORE_IWARP_EVENT_LLP_TIMEOUT: 
qlnxr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, "IWARP_EVENT_LLP_TIMEOUT"); break; case ECORE_IWARP_EVENT_REMOTE_PROTECTION_ERROR: qlnxr_iw_qp_event(context, params, IB_EVENT_QP_ACCESS_ERR, "IWARP_EVENT_REMOTE_PROTECTION_ERROR"); break; case ECORE_IWARP_EVENT_CQ_OVERFLOW: qlnxr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, "QED_IWARP_EVENT_CQ_OVERFLOW"); break; case ECORE_IWARP_EVENT_QP_CATASTROPHIC: qlnxr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, "QED_IWARP_EVENT_QP_CATASTROPHIC"); break; case ECORE_IWARP_EVENT_LOCAL_ACCESS_ERROR: qlnxr_iw_qp_event(context, params, IB_EVENT_QP_ACCESS_ERR, "IWARP_EVENT_LOCAL_ACCESS_ERROR"); break; case ECORE_IWARP_EVENT_REMOTE_OPERATION_ERROR: qlnxr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, "IWARP_EVENT_REMOTE_OPERATION_ERROR"); break; case ECORE_IWARP_EVENT_TERMINATE_RECEIVED: QL_DPRINT12(ha, "Got terminate message" " ECORE_IWARP_EVENT_TERMINATE_RECEIVED\n"); break; default: QL_DPRINT12(ha, "Unknown event [0x%x] received \n", params->event); break; }; QL_DPRINT12(ha, "[context, event] = [%p, 0x%x] " "exit\n", context, params->event); return 0; } static int qlnxr_addr4_resolve(struct qlnxr_dev *dev, struct sockaddr_in *src_in, struct sockaddr_in *dst_in, u8 *dst_mac) { int rc; #if __FreeBSD_version >= 1100000 rc = arpresolve(dev->ha->ifp, 0, NULL, (struct sockaddr *)dst_in, dst_mac, NULL, NULL); #else struct llentry *lle; rc = arpresolve(dev->ha->ifp, NULL, NULL, (struct sockaddr *)dst_in, dst_mac, &lle); #endif QL_DPRINT12(dev->ha, "rc = %d " "sa_len = 0x%x sa_family = 0x%x IP Address = %d.%d.%d.%d " "Dest MAC %02x:%02x:%02x:%02x:%02x:%02x\n", rc, dst_in->sin_len, dst_in->sin_family, NIPQUAD((dst_in->sin_addr.s_addr)), dst_mac[0], dst_mac[1], dst_mac[2], dst_mac[3], dst_mac[4], dst_mac[5]); return rc; } int qlnxr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { struct qlnxr_dev *dev; struct ecore_iwarp_connect_out out_params; struct ecore_iwarp_connect_in in_params; struct qlnxr_iw_ep 
*ep; struct qlnxr_qp *qp; struct sockaddr_in *laddr; struct sockaddr_in *raddr; int rc = 0; qlnx_host_t *ha; dev = get_qlnxr_dev((cm_id->device)); ha = dev->ha; QL_DPRINT12(ha, "[cm_id, conn_param] = [%p, %p] " "enter \n", cm_id, conn_param); if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; qp = idr_find(&dev->qpidr, conn_param->qpn); laddr = (struct sockaddr_in *)&cm_id->local_addr; raddr = (struct sockaddr_in *)&cm_id->remote_addr; QL_DPRINT12(ha, "local = [%d.%d.%d.%d, %d] remote = [%d.%d.%d.%d, %d]\n", NIPQUAD((laddr->sin_addr.s_addr)), laddr->sin_port, NIPQUAD((raddr->sin_addr.s_addr)), raddr->sin_port); ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (!ep) { QL_DPRINT11(ha, "struct qlnxr_iw_ep " "alloc memory failed\n"); return -ENOMEM; } ep->dev = dev; ep->qp = qp; cm_id->add_ref(cm_id); ep->cm_id = cm_id; memset(&in_params, 0, sizeof (struct ecore_iwarp_connect_in)); memset(&out_params, 0, sizeof (struct ecore_iwarp_connect_out)); in_params.event_cb = qlnxr_iw_event_handler; in_params.cb_context = ep; in_params.cm_info.ip_version = ECORE_TCP_IPV4; in_params.cm_info.remote_ip[0] = ntohl(raddr->sin_addr.s_addr); in_params.cm_info.local_ip[0] = ntohl(laddr->sin_addr.s_addr); in_params.cm_info.remote_port = ntohs(raddr->sin_port); in_params.cm_info.local_port = ntohs(laddr->sin_port); in_params.cm_info.vlan = 0; in_params.mss = dev->ha->ifp->if_mtu - 40; QL_DPRINT12(ha, "remote_ip = [%d.%d.%d.%d] " "local_ip = [%d.%d.%d.%d] remote_port = %d local_port = %d " "vlan = %d\n", NIPQUAD((in_params.cm_info.remote_ip[0])), NIPQUAD((in_params.cm_info.local_ip[0])), in_params.cm_info.remote_port, in_params.cm_info.local_port, in_params.cm_info.vlan); rc = qlnxr_addr4_resolve(dev, laddr, raddr, (u8 *)in_params.remote_mac_addr); if (rc) { QL_DPRINT11(ha, "qlnxr_addr4_resolve failed\n"); goto err; } QL_DPRINT12(ha, "ord = %d ird=%d private_data=%p" " private_data_len=%d rq_psn=%d\n", conn_param->ord, conn_param->ird, conn_param->private_data, 
conn_param->private_data_len, qp->rq_psn); in_params.cm_info.ord = conn_param->ord; in_params.cm_info.ird = conn_param->ird; in_params.cm_info.private_data = conn_param->private_data; in_params.cm_info.private_data_len = conn_param->private_data_len; in_params.qp = qp->ecore_qp; memcpy(in_params.local_mac_addr, dev->ha->primary_mac, ETH_ALEN); rc = ecore_iwarp_connect(dev->rdma_ctx, &in_params, &out_params); if (rc) { QL_DPRINT12(ha, "ecore_iwarp_connect failed\n"); goto err; } QL_DPRINT12(ha, "exit\n"); return rc; err: cm_id->rem_ref(cm_id); kfree(ep); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } int qlnxr_iw_create_listen(struct iw_cm_id *cm_id, int backlog) { struct qlnxr_dev *dev; struct qlnxr_iw_listener *listener; struct ecore_iwarp_listen_in iparams; struct ecore_iwarp_listen_out oparams; struct sockaddr_in *laddr; qlnx_host_t *ha; int rc; dev = get_qlnxr_dev((cm_id->device)); ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; laddr = (struct sockaddr_in *)&cm_id->local_addr; listener = kzalloc(sizeof(*listener), GFP_KERNEL); if (listener == NULL) { QL_DPRINT11(ha, "listener memory alloc failed\n"); return -ENOMEM; } listener->dev = dev; cm_id->add_ref(cm_id); listener->cm_id = cm_id; listener->backlog = backlog; memset(&iparams, 0, sizeof (struct ecore_iwarp_listen_in)); memset(&oparams, 0, sizeof (struct ecore_iwarp_listen_out)); iparams.cb_context = listener; iparams.event_cb = qlnxr_iw_event_handler; iparams.max_backlog = backlog; iparams.ip_version = ECORE_TCP_IPV4; iparams.ip_addr[0] = ntohl(laddr->sin_addr.s_addr); iparams.port = ntohs(laddr->sin_port); iparams.vlan = 0; QL_DPRINT12(ha, "[%d.%d.%d.%d, %d] iparamsport=%d\n", NIPQUAD((laddr->sin_addr.s_addr)), laddr->sin_port, iparams.port); rc = ecore_iwarp_create_listen(dev->rdma_ctx, &iparams, &oparams); if (rc) { QL_DPRINT11(ha, "ecore_iwarp_create_listen failed rc = %d\n", rc); goto err; } listener->ecore_handle = oparams.handle; 
cm_id->provider_data = listener; QL_DPRINT12(ha, "exit\n"); return rc; err: cm_id->rem_ref(cm_id); kfree(listener); QL_DPRINT12(ha, "exit [%d]\n", rc); return rc; } void qlnxr_iw_destroy_listen(struct iw_cm_id *cm_id) { struct qlnxr_iw_listener *listener = cm_id->provider_data; struct qlnxr_dev *dev = get_qlnxr_dev((cm_id->device)); int rc = 0; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter\n"); if (listener->ecore_handle) rc = ecore_iwarp_destroy_listen(dev->rdma_ctx, listener->ecore_handle); cm_id->rem_ref(cm_id); QL_DPRINT12(ha, "exit [%d]\n", rc); return; } int qlnxr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)cm_id->provider_data; struct qlnxr_dev *dev = ep->dev; struct qlnxr_qp *qp; struct ecore_iwarp_accept_in params; int rc; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter qpid=%d\n", conn_param->qpn); if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) return -EINVAL; qp = idr_find(&dev->qpidr, conn_param->qpn); if (!qp) { QL_DPRINT11(ha, "idr_find failed invalid qpn = %d\n", conn_param->qpn); return -EINVAL; } ep->qp = qp; qp->ep = ep; cm_id->add_ref(cm_id); ep->cm_id = cm_id; params.ep_context = ep->ecore_context; params.cb_context = ep; params.qp = ep->qp->ecore_qp; params.private_data = conn_param->private_data; params.private_data_len = conn_param->private_data_len; params.ird = conn_param->ird; params.ord = conn_param->ord; rc = ecore_iwarp_accept(dev->rdma_ctx, ¶ms); if (rc) { QL_DPRINT11(ha, "ecore_iwarp_accept failed %d\n", rc); goto err; } QL_DPRINT12(ha, "exit\n"); return 0; err: cm_id->rem_ref(cm_id); QL_DPRINT12(ha, "exit rc = %d\n", rc); return rc; } int qlnxr_iw_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { #if __FreeBSD_version >= 1102000 struct qlnxr_iw_ep *ep = (struct qlnxr_iw_ep *)cm_id->provider_data; struct qlnxr_dev *dev = ep->dev; struct ecore_iwarp_reject_in params; int rc; params.ep_context = ep->ecore_context; params.cb_context 
= ep; params.private_data = pdata; params.private_data_len = pdata_len; ep->qp = NULL; rc = ecore_iwarp_reject(dev->rdma_ctx, ¶ms); return rc; #else printf("iWARP reject_cr not implemented\n"); return -EINVAL; #endif /* #if __FreeBSD_version >= 1102000 */ } void qlnxr_iw_qp_add_ref(struct ib_qp *ibqp) { struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); qlnx_host_t *ha; ha = qp->dev->ha; QL_DPRINT12(ha, "enter ibqp = %p\n", ibqp); atomic_inc(&qp->refcnt); QL_DPRINT12(ha, "exit \n"); return; } void qlnxr_iw_qp_rem_ref(struct ib_qp *ibqp) { struct qlnxr_qp *qp = get_qlnxr_qp(ibqp); qlnx_host_t *ha; ha = qp->dev->ha; QL_DPRINT12(ha, "enter ibqp = %p qp = %p\n", ibqp, qp); if (atomic_dec_and_test(&qp->refcnt)) { qlnxr_idr_remove(qp->dev, qp->qp_id); kfree(qp); } QL_DPRINT12(ha, "exit \n"); return; } struct ib_qp * qlnxr_iw_get_qp(struct ib_device *ibdev, int qpn) { struct qlnxr_dev *dev = get_qlnxr_dev(ibdev); struct ib_qp *qp; qlnx_host_t *ha; ha = dev->ha; QL_DPRINT12(ha, "enter dev = %p ibdev = %p qpn = %d\n", dev, ibdev, qpn); qp = idr_find(&dev->qpidr, qpn); QL_DPRINT12(ha, "exit qp = %p\n", qp); return (qp); } diff --git a/sys/dev/qlnx/qlnxr/qlnxr_verbs.h b/sys/dev/qlnx/qlnxr/qlnxr_verbs.h index 5539a6a04871..985587dbd9ed 100644 --- a/sys/dev/qlnx/qlnxr/qlnxr_verbs.h +++ b/sys/dev/qlnx/qlnxr/qlnxr_verbs.h @@ -1,261 +1,261 @@ /* * Copyright (c) 2018-2019 Cavium, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __QLNXR_VERBS_H__ #define __QLNXR_VERBS_H__ extern int qlnxr_iw_query_gid(struct ib_device *, uint8_t port, int index, union ib_gid *gid); extern int qlnxr_query_gid(struct ib_device *, u8 port, int index, union ib_gid *gid); extern struct ib_srq *qlnxr_create_srq(struct ib_pd *, struct ib_srq_init_attr *, struct ib_udata *); extern int qlnxr_destroy_srq(struct ib_srq *); extern int qlnxr_modify_srq(struct ib_srq *, struct ib_srq_attr *, enum ib_srq_attr_mask, struct ib_udata *); extern int qlnxr_query_srq(struct ib_srq *, struct ib_srq_attr *); extern int qlnxr_post_srq_recv(struct ib_srq *, - struct ib_recv_wr *, - struct ib_recv_wr **bad_recv_wr); + const struct ib_recv_wr *, + const struct ib_recv_wr **bad_recv_wr); #if __FreeBSD_version < 1102000 extern int qlnxr_query_device(struct ib_device *, struct ib_device_attr *); #else extern int qlnxr_query_device(struct ib_device *, struct ib_device_attr *, struct ib_udata *); extern int qlnxr_get_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable); #endif extern int qlnxr_query_port(struct ib_device *, u8 port, struct ib_port_attr *props); extern int qlnxr_modify_port(struct ib_device *, u8 
port, int mask, struct ib_port_modify *props); extern enum rdma_link_layer qlnxr_link_layer(struct ib_device *device, uint8_t port_num); struct ib_pd *qlnxr_alloc_pd(struct ib_device *, struct ib_ucontext *, struct ib_udata *); extern int qlnxr_dealloc_pd(struct ib_pd *pd); #if __FreeBSD_version >= 1102000 extern struct ib_cq *qlnxr_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *ib_ctx, struct ib_udata *udata); #else #if __FreeBSD_version >= 1100000 extern struct ib_cq *qlnxr_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *ib_ctx, struct ib_udata *udata); #else extern struct ib_cq *qlnxr_create_cq(struct ib_device *ibdev, int cqe, int comp_vector, struct ib_ucontext *ib_ctx, struct ib_udata *udata); #endif #endif /* #if __FreeBSD_version >= 1102000 */ extern int qlnxr_destroy_cq(struct ib_cq *); extern int qlnxr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); extern int qlnxr_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc); extern struct ib_qp *qlnxr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs, struct ib_udata *); extern int qlnxr_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); extern int qlnxr_query_qp(struct ib_qp *, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *); extern int qlnxr_destroy_qp(struct ib_qp *); extern int qlnxr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); #if __FreeBSD_version >= 1102000 extern struct ib_ah *qlnxr_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr, struct ib_udata *udata); #else extern struct ib_ah *qlnxr_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr); #endif /* #if __FreeBSD_version >= 1102000 */ extern int qlnxr_destroy_ah(struct ib_ah *ibah); extern int qlnxr_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr); extern int qlnxr_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr); #if __FreeBSD_version >= 1102000 extern 
int qlnxr_process_mad(struct ib_device *ibdev, int process_mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *mad_hdr, size_t in_mad_size, struct ib_mad_hdr *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); #else extern int qlnxr_process_mad(struct ib_device *ibdev, int process_mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad); #endif /* #if __FreeBSD_version >= 1102000 */ extern int qlnxr_post_send(struct ib_qp *, - struct ib_send_wr *, - struct ib_send_wr **bad_wr); + const struct ib_send_wr *, + const struct ib_send_wr **bad_wr); extern int qlnxr_post_recv(struct ib_qp *, - struct ib_recv_wr *, - struct ib_recv_wr **bad_wr); + const struct ib_recv_wr *, + const struct ib_recv_wr **bad_wr); extern int qlnxr_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags); extern struct ib_mr *qlnxr_get_dma_mr(struct ib_pd *, int acc); #if __FreeBSD_version < 1102000 extern struct ib_mr *qlnxr_reg_kernel_mr(struct ib_pd *, struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, u64 *iova_start); #endif /* #if __FreeBSD_version < 1102000 */ extern int qlnxr_dereg_mr(struct ib_mr *); #if __FreeBSD_version >= 1102000 extern struct ib_mr *qlnxr_reg_user_mr(struct ib_pd *, u64 start, u64 length, u64 virt, int acc, struct ib_udata *); #else extern struct ib_mr *qlnxr_reg_user_mr(struct ib_pd *, u64 start, u64 length, u64 virt, int acc, struct ib_udata *, int mr_id); #endif /* #if __FreeBSD_version >= 1102000 */ #if __FreeBSD_version >= 1102000 extern struct ib_mr *qlnxr_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); extern int qlnxr_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); #else extern struct ib_mr *qlnxr_alloc_frmr(struct ib_pd *pd, int max_page_list_len); extern struct ib_fast_reg_page_list *qlnxr_alloc_frmr_page_list( struct ib_device *ibdev, int page_list_len); extern void 
qlnxr_free_frmr_page_list(struct ib_fast_reg_page_list *page_list); #endif /* #if __FreeBSD_version >= 1102000 */ extern struct ib_ucontext *qlnxr_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata); extern int qlnxr_dealloc_ucontext(struct ib_ucontext *ibctx); extern int qlnxr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); extern int qlnxr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); extern int qlnxr_iw_create_listen(struct iw_cm_id *cm_id, int backlog); void qlnxr_iw_destroy_listen(struct iw_cm_id *cm_id); extern int qlnxr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); extern int qlnxr_iw_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); extern void qlnxr_iw_qp_add_ref(struct ib_qp *qp); extern void qlnxr_iw_qp_rem_ref(struct ib_qp *qp); extern struct ib_qp *qlnxr_iw_get_qp(struct ib_device *dev, int qpn); #endif /* #ifndef __QLNXR_VERBS_H__ */ diff --git a/sys/ofed/drivers/infiniband/core/ib_mad.c b/sys/ofed/drivers/infiniband/core/ib_mad.c index 82a3f19d92bb..5f9c353052e8 100644 --- a/sys/ofed/drivers/infiniband/core/ib_mad.c +++ b/sys/ofed/drivers/infiniband/core/ib_mad.c @@ -1,3322 +1,3323 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #define LINUXKPI_PARAM_PREFIX ibcore_ #define KBUILD_MODNAME "ibcore" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include #include #include #include "mad_priv.h" #include "mad_rmpp.h" #include "smi.h" #include "opa_smi.h" #include "agent.h" #include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; module_param_named(send_queue_size, mad_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; /* Port list lock */ static DEFINE_SPINLOCK(ib_mad_port_list_lock); /* Forward declarations */ static int method_in_use(struct ib_mad_mgmt_method_table **method, struct ib_mad_reg_req *mad_reg_req); static void remove_mad_reg_req(struct ib_mad_agent_private *priv); static struct ib_mad_agent_private *find_mad_agent( struct ib_mad_port_private *port_priv, const struct ib_mad_hdr *mad); static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad); static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv); static void timeout_sends(struct work_struct *work); static void local_completions(struct work_struct *work); static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv, u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, struct ib_wc *wc); static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); /* * Returns a ib_mad_port_private structure or NULL for a device/port * Assumes ib_mad_port_list_lock is being held */ static inline struct ib_mad_port_private * 
__ib_get_mad_port(struct ib_device *device, int port_num) { struct ib_mad_port_private *entry; list_for_each_entry(entry, &ib_mad_port_list, port_list) { if (entry->device == device && entry->port_num == port_num) return entry; } return NULL; } /* * Wrapper function to return a ib_mad_port_private structure or NULL * for a device/port */ static inline struct ib_mad_port_private * ib_get_mad_port(struct ib_device *device, int port_num) { struct ib_mad_port_private *entry; unsigned long flags; spin_lock_irqsave(&ib_mad_port_list_lock, flags); entry = __ib_get_mad_port(device, port_num); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); return entry; } static inline u8 convert_mgmt_class(u8 mgmt_class) { /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ? 0 : mgmt_class; } static int get_spl_qp_index(enum ib_qp_type qp_type) { switch (qp_type) { case IB_QPT_SMI: return 0; case IB_QPT_GSI: return 1; default: return -1; } } static int vendor_class_index(u8 mgmt_class) { return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START; } static int is_vendor_class(u8 mgmt_class) { if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) || (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END)) return 0; return 1; } static int is_vendor_oui(char *oui) { if (oui[0] || oui[1] || oui[2]) return 1; return 0; } static int is_vendor_method_in_use( struct ib_mad_mgmt_vendor_class *vendor_class, struct ib_mad_reg_req *mad_reg_req) { struct ib_mad_mgmt_method_table *method; int i; for (i = 0; i < MAX_MGMT_OUI; i++) { if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) { method = vendor_class->method_table[i]; if (method) { if (method_in_use(&method, mad_reg_req)) return 1; else break; } } } return 0; } int ib_response_mad(const struct ib_mad_hdr *hdr) { return ((hdr->method & IB_MGMT_METHOD_RESP) || (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) || ((hdr->mgmt_class == IB_MGMT_CLASS_BM) && (hdr->attr_mod & IB_BM_ATTR_MOD_RESP))); } 
EXPORT_SYMBOL(ib_response_mad);

/*
 * ib_register_mad_agent - Register to send/receive MADs
 *
 * Validates the QP type, RMPP version and (optional) registration
 * request, looks up the MAD port for device/port_num, allocates and
 * initialises a private agent and, under the port's reg_lock, assigns a
 * fresh hi_tid and records the class/method registration.
 *
 * Returns a pointer to the agent embedded in the private structure, or
 * an ERR_PTR(): -EPROTONOSUPPORT (QP type not supported by the port),
 * -EINVAL (bad parameters, or the requested methods are already in
 * use), -ENODEV (no such MAD port), -ENOMEM, or the error returned by
 * add_nonoui_reg_req()/add_oui_reg_req().
 */
struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
					   u8 port_num,
					   enum ib_qp_type qp_type,
					   struct ib_mad_reg_req *mad_reg_req,
					   u8 rmpp_version,
					   ib_mad_send_handler send_handler,
					   ib_mad_recv_handler recv_handler,
					   void *context,
					   u32 registration_flags)
{
	struct ib_mad_port_private *port_priv;
	/* Default error for every "goto error*" that does not set ret itself. */
	struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
	struct ib_mad_agent_private *mad_agent_priv;
	struct ib_mad_reg_req *reg_req = NULL;
	struct ib_mad_mgmt_class_table *class;
	struct ib_mad_mgmt_vendor_class_table *vendor;
	struct ib_mad_mgmt_vendor_class *vendor_class;
	struct ib_mad_mgmt_method_table *method;
	int ret2, qpn;
	unsigned long flags;
	u8 mgmt_class, vclass;

	if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) ||
	    (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num)))
		return ERR_PTR(-EPROTONOSUPPORT);

	/* Validate parameters */
	qpn = get_spl_qp_index(qp_type);
	if (qpn == -1) {
		dev_notice(&device->dev,
			   "ib_register_mad_agent: invalid QP Type %d\n",
			   qp_type);
		goto error1;
	}

	if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
		dev_notice(&device->dev,
			   "ib_register_mad_agent: invalid RMPP Version %u\n",
			   rmpp_version);
		goto error1;
	}

	/* Validate MAD registration request if supplied */
	if (mad_reg_req) {
		if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
			dev_notice(&device->dev,
				   "ib_register_mad_agent: invalid Class Version %u\n",
				   mad_reg_req->mgmt_class_version);
			goto error1;
		}
		if (!recv_handler) {
			dev_notice(&device->dev,
				   "ib_register_mad_agent: no recv_handler\n");
			goto error1;
		}
		if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
			/*
			 * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
			 * one in this range currently allowed
			 */
			if (mad_reg_req->mgmt_class !=
			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
				dev_notice(&device->dev,
					   "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
					   mad_reg_req->mgmt_class);
				goto error1;
			}
		} else if (mad_reg_req->mgmt_class == 0) {
			/*
			 * Class 0 is reserved in IBA and is used for
			 * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
			 */
			dev_notice(&device->dev,
				   "ib_register_mad_agent: Invalid Mgmt Class 0\n");
			goto error1;
		} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
			/*
			 * If class is in "new" vendor range,
			 * ensure supplied OUI is not zero
			 */
			if (!is_vendor_oui(mad_reg_req->oui)) {
				dev_notice(&device->dev,
					   "ib_register_mad_agent: No OUI specified for class 0x%x\n",
					   mad_reg_req->mgmt_class);
				goto error1;
			}
		}
		/* Make sure class supplied is consistent with RMPP */
		if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
			if (rmpp_version) {
				dev_notice(&device->dev,
					   "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
					   mad_reg_req->mgmt_class);
				goto error1;
			}
		}

		/* Make sure class supplied is consistent with QP type */
		if (qp_type == IB_QPT_SMI) {
			if ((mad_reg_req->mgmt_class !=
					IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
			    (mad_reg_req->mgmt_class !=
					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
				dev_notice(&device->dev,
					   "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
					   mad_reg_req->mgmt_class);
				goto error1;
			}
		} else {
			if ((mad_reg_req->mgmt_class ==
					IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
			    (mad_reg_req->mgmt_class ==
					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
				dev_notice(&device->dev,
					   "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
					   mad_reg_req->mgmt_class);
				goto error1;
			}
		}
	} else {
		/* No registration request supplied */
		if (!send_handler)
			goto error1;
		if (registration_flags & IB_MAD_USER_RMPP)
			goto error1;
	}

	/* Validate device and port */
	port_priv = ib_get_mad_port(device, port_num);
	if (!port_priv) {
		dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n");
		ret = ERR_PTR(-ENODEV);
		goto error1;
	}

	/*
	 * Verify the QP requested is supported.  For example, Ethernet
	 * devices will not have QP0
	 */
	if (!port_priv->qp_info[qpn].qp) {
		dev_notice(&device->dev,
			   "ib_register_mad_agent: QP %d not supported\n", qpn);
		ret = ERR_PTR(-EPROTONOSUPPORT);
		goto error1;
	}

	/* Allocate structures */
	mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
	if (!mad_agent_priv) {
		ret = ERR_PTR(-ENOMEM);
		goto error1;
	}

	/* Keep a private copy of the caller's registration request. */
	if (mad_reg_req) {
		reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
		if (!reg_req) {
			ret = ERR_PTR(-ENOMEM);
			goto error3;
		}
	}

	/* Now, fill in the various structures */
	mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
	mad_agent_priv->reg_req = reg_req;
	mad_agent_priv->agent.rmpp_version = rmpp_version;
	mad_agent_priv->agent.device = device;
	mad_agent_priv->agent.recv_handler = recv_handler;
	mad_agent_priv->agent.send_handler = send_handler;
	mad_agent_priv->agent.context = context;
	mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
	mad_agent_priv->agent.port_num = port_num;
	mad_agent_priv->agent.flags = registration_flags;
	spin_lock_init(&mad_agent_priv->lock);
	INIT_LIST_HEAD(&mad_agent_priv->send_list);
	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
	INIT_LIST_HEAD(&mad_agent_priv->done_list);
	INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
	INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
	INIT_LIST_HEAD(&mad_agent_priv->local_list);
	INIT_WORK(&mad_agent_priv->local_work, local_completions);
	atomic_set(&mad_agent_priv->refcount, 1);
	init_completion(&mad_agent_priv->comp);

	spin_lock_irqsave(&port_priv->reg_lock, flags);
	/* Pre-increment: the first agent gets hi_tid 1, never 0. */
	mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;

	/*
	 * Make sure MAD registration (if supplied)
	 * is non overlapping with any existing ones
	 */
	if (mad_reg_req) {
		mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
		if (!is_vendor_class(mgmt_class)) {
			class = port_priv->version[mad_reg_req->
						   mgmt_class_version].class;
			if (class) {
				method = class->method_table[mgmt_class];
				if (method) {
					/* Clash: ret keeps its -EINVAL default. */
					if (method_in_use(&method,
							   mad_reg_req))
						goto error4;
				}
			}
			ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
						  mgmt_class);
		} else {
			/* "New" vendor class range */
			vendor = port_priv->version[mad_reg_req->
						    mgmt_class_version].vendor;
			if (vendor) {
				vclass = vendor_class_index(mgmt_class);
				vendor_class = vendor->vendor_class[vclass];
				if (vendor_class) {
					/* Clash: ret keeps its -EINVAL default. */
					if (is_vendor_method_in_use(
							vendor_class,
							mad_reg_req))
						goto error4;
				}
			}
			ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
		}
		if (ret2) {
			ret = ERR_PTR(ret2);
			goto error4;
		}
	}

	/* Add mad agent into port's agent list */
	list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
	spin_unlock_irqrestore(&port_priv->reg_lock, flags);

	return &mad_agent_priv->agent;

	/* Unwind in reverse order of acquisition. */
error4:
	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
	kfree(reg_req);
error3:
	kfree(mad_agent_priv);
error1:
	return ret;
}
EXPORT_SYMBOL(ib_register_mad_agent);

/*
 * Non-zero when mad_snoop_flags asks for send-side snooping.  Only
 * IB_MAD_SNOOP_SEND_COMPLETIONS is tested; the other send flags are
 * commented out of the mask.
 */
static inline int is_snooping_sends(int mad_snoop_flags)
{
	return (mad_snoop_flags &
		(/*IB_MAD_SNOOP_POSTED_SENDS |
		 IB_MAD_SNOOP_RMPP_SENDS |*/
		 IB_MAD_SNOOP_SEND_COMPLETIONS /*|
		 IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
}

/*
 * Non-zero when mad_snoop_flags asks for receive-side snooping.  Only
 * IB_MAD_SNOOP_RECVS is tested; the RMPP variant is commented out.
 */
static inline int is_snooping_recvs(int mad_snoop_flags)
{
	return (mad_snoop_flags &
		(IB_MAD_SNOOP_RECVS /*|
		 IB_MAD_SNOOP_RMPP_RECVS*/));
}

/*
 * Store mad_snoop_priv in the first free slot of qp_info->snoop_table,
 * growing the table by one entry when it is full, and bump snoop_count.
 * Runs entirely under qp_info->snoop_lock.
 *
 * Returns the slot index, or -ENOMEM when the table cannot be grown.
 */
static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
				struct ib_mad_snoop_private *mad_snoop_priv)
{
	struct ib_mad_snoop_private **new_snoop_table;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&qp_info->snoop_lock, flags);
	/* Check for empty slot in array. */
	for (i = 0; i < qp_info->snoop_table_size; i++)
		if (!qp_info->snoop_table[i])
			break;

	if (i == qp_info->snoop_table_size) {
		/* Grow table. */
		/*
		 * "sizeof mad_snoop_priv" is the size of one pointer, i.e.
		 * one table element; GFP_ATOMIC because snoop_lock is held.
		 */
		new_snoop_table = krealloc(qp_info->snoop_table,
					   sizeof mad_snoop_priv *
					   (qp_info->snoop_table_size + 1),
					   GFP_ATOMIC);
		if (!new_snoop_table) {
			i = -ENOMEM;
			goto out;
		}

		qp_info->snoop_table = new_snoop_table;
		qp_info->snoop_table_size++;
	}
	qp_info->snoop_table[i] = mad_snoop_priv;
	atomic_inc(&qp_info->snoop_count);
out:
	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
	return i;
}

/*
 * ib_register_mad_snoop - register a snooping agent on a special QP.
 *
 * Requires snoop_handler when send snooping is requested and
 * recv_handler when receive snooping is requested.  Allocates a snoop
 * agent, fills it in and enters it into the QP's snoop table.
 *
 * Returns the embedded agent, or ERR_PTR(-EINVAL / -ENODEV / -ENOMEM).
 * agent.hi_tid is left at the zero provided by kzalloc().
 */
struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
					   u8 port_num,
					   enum ib_qp_type qp_type,
					   int mad_snoop_flags,
					   ib_mad_snoop_handler snoop_handler,
					   ib_mad_recv_handler recv_handler,
					   void *context)
{
	struct ib_mad_port_private *port_priv;
	struct ib_mad_agent *ret;
	struct ib_mad_snoop_private *mad_snoop_priv;
	int qpn;

	/* Validate parameters */
	if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
	    (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
		ret = ERR_PTR(-EINVAL);
		goto error1;
	}
	qpn = get_spl_qp_index(qp_type);
	if (qpn == -1) {
		ret = ERR_PTR(-EINVAL);
		goto error1;
	}
	port_priv = ib_get_mad_port(device, port_num);
	if (!port_priv) {
		ret = ERR_PTR(-ENODEV);
		goto error1;
	}
	/* Allocate structures */
	mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
	if (!mad_snoop_priv) {
		ret = ERR_PTR(-ENOMEM);
		goto error1;
	}

	/* Now, fill in the various structures */
	mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
	mad_snoop_priv->agent.device = device;
	mad_snoop_priv->agent.recv_handler = recv_handler;
	mad_snoop_priv->agent.snoop_handler = snoop_handler;
	mad_snoop_priv->agent.context = context;
	mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
	mad_snoop_priv->agent.port_num = port_num;
	mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
	init_completion(&mad_snoop_priv->comp);
	mad_snoop_priv->snoop_index = register_snoop_agent(
						&port_priv->qp_info[qpn],
						mad_snoop_priv);
	if (mad_snoop_priv->snoop_index < 0) {
		ret = ERR_PTR(mad_snoop_priv->snoop_index);
		goto error2;
	}

	atomic_set(&mad_snoop_priv->refcount, 1);
	return &mad_snoop_priv->agent;

error2:
	kfree(mad_snoop_priv);
error1:
	return ret;
}
EXPORT_SYMBOL(ib_register_mad_snoop);

/* Drop one agent reference; signal ->comp when the count reaches zero. */
static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
{
	if (atomic_dec_and_test(&mad_agent_priv->refcount))
		complete(&mad_agent_priv->comp);
}

/* Drop one snoop-agent reference; signal ->comp when the count reaches zero. */
static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
{
	if (atomic_dec_and_test(&mad_snoop_priv->refcount))
		complete(&mad_snoop_priv->comp);
}

/*
 * Tear down a regular MAD agent: cancel its sends and timeout work,
 * remove its registration and list entry under reg_lock, flush the
 * port workqueue, cancel RMPP receives, then drop the initial
 * reference and wait for all remaining references to go away before
 * freeing the agent and its copied registration request.
 */
static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
{
	struct ib_mad_port_private *port_priv;
	unsigned long flags;

	/* Note that we could still be handling received MADs */

	/*
	 * Canceling all sends results in dropping received response
	 * MADs, preventing us from queuing additional work
	 */
	cancel_mads(mad_agent_priv);
	port_priv = mad_agent_priv->qp_info->port_priv;
	cancel_delayed_work_sync(&mad_agent_priv->timed_work);

	spin_lock_irqsave(&port_priv->reg_lock, flags);
	remove_mad_reg_req(mad_agent_priv);
	list_del(&mad_agent_priv->agent_list);
	spin_unlock_irqrestore(&port_priv->reg_lock, flags);

	flush_workqueue(port_priv->wq);
	ib_cancel_rmpp_recvs(mad_agent_priv);

	/* Release the reference taken at registration and wait for the rest. */
	deref_mad_agent(mad_agent_priv);
	wait_for_completion(&mad_agent_priv->comp);

	kfree(mad_agent_priv->reg_req);
	kfree(mad_agent_priv);
}

/*
 * Tear down a snoop agent: clear its snoop-table slot under snoop_lock,
 * drop the initial reference, wait for outstanding references and free
 * the agent.
 */
static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
{
	struct ib_mad_qp_info *qp_info;
	unsigned long flags;

	qp_info = mad_snoop_priv->qp_info;
	spin_lock_irqsave(&qp_info->snoop_lock, flags);
	qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
	atomic_dec(&qp_info->snoop_count);
	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);

	deref_snoop_agent(mad_snoop_priv);
	wait_for_completion(&mad_snoop_priv->comp);

	kfree(mad_snoop_priv);
}

/*
 * ib_unregister_mad_agent - Unregisters a client from using MAD services
 *
 * Dispatches on agent.hi_tid to the regular or snoop teardown path.
 * Always returns 0.
 */
int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
{
	struct ib_mad_agent_private *mad_agent_priv;
	struct ib_mad_snoop_private *mad_snoop_priv;

	/* If the TID is zero, the agent can only snoop. */
	if (mad_agent->hi_tid) {
		mad_agent_priv = container_of(mad_agent,
					      struct ib_mad_agent_private,
					      agent);
		unregister_mad_agent(mad_agent_priv);
	} else {
		mad_snoop_priv = container_of(mad_agent,
					      struct ib_mad_snoop_private,
					      agent);
		unregister_mad_snoop(mad_snoop_priv);
	}
	return 0;
}
EXPORT_SYMBOL(ib_unregister_mad_agent);

/*
 * Unlink mad_list from the queue it belongs to and decrement that
 * queue's count, under the queue lock.  BUG()s if the entry has no
 * owning queue.
 */
static void dequeue_mad(struct ib_mad_list_head *mad_list)
{
	struct ib_mad_queue *mad_queue;
	unsigned long flags;

	BUG_ON(!mad_list->mad_queue);
	mad_queue = mad_list->mad_queue;
	spin_lock_irqsave(&mad_queue->lock, flags);
	list_del(&mad_list->list);
	mad_queue->count--;
	spin_unlock_irqrestore(&mad_queue->lock, flags);
}

/*
 * Invoke snoop_handler of every snoop agent on qp_info whose flags
 * intersect mad_snoop_flags.
 */
static void snoop_send(struct ib_mad_qp_info *qp_info,
		       struct ib_mad_send_buf *send_buf,
		       struct ib_mad_send_wc *mad_send_wc,
		       int mad_snoop_flags)
{
	struct ib_mad_snoop_private *mad_snoop_priv;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&qp_info->snoop_lock, flags);
	for (i = 0; i < qp_info->snoop_table_size; i++) {
		mad_snoop_priv = qp_info->snoop_table[i];
		if (!mad_snoop_priv ||
		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
			continue;

		/*
		 * Pin the agent, then drop snoop_lock so the handler is
		 * not called with the lock held; retake it to continue
		 * the scan.
		 */
		atomic_inc(&mad_snoop_priv->refcount);
		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
		mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
						    send_buf, mad_send_wc);
		deref_snoop_agent(mad_snoop_priv);
		spin_lock_irqsave(&qp_info->snoop_lock, flags);
	}
	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
}

/*
 * Invoke recv_handler (with a NULL send buffer) of every snoop agent on
 * qp_info whose flags intersect mad_snoop_flags.
 */
static void snoop_recv(struct ib_mad_qp_info *qp_info,
		       struct ib_mad_recv_wc *mad_recv_wc,
		       int mad_snoop_flags)
{
	struct ib_mad_snoop_private *mad_snoop_priv;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&qp_info->snoop_lock, flags);
	for (i = 0; i < qp_info->snoop_table_size; i++) {
		mad_snoop_priv = qp_info->snoop_table[i];
		if (!mad_snoop_priv ||
		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
			continue;

		/* Same pin / unlock / call / unpin / relock sequence as snoop_send(). */
		atomic_inc(&mad_snoop_priv->refcount);
		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
		mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL,
						   mad_recv_wc);
		deref_snoop_agent(mad_snoop_priv);
		spin_lock_irqsave(&qp_info->snoop_lock, flags);
	}
	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
}

/*
 * Fill *wc as a successful receive completion on QP0 for a locally
 * processed SMP: everything is zeroed first, then the caller-supplied
 * cqe, qp, slid, pkey_index and port_num are stored and byte_len is set
 * to the size of one ib_mad plus one ib_grh.
 */
static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid,
		u16 pkey_index, u8 port_num, struct ib_wc *wc)
{
	memset(wc, 0, sizeof *wc);
	wc->wr_cqe = cqe;
	wc->status = IB_WC_SUCCESS;
	wc->opcode = IB_WC_RECV;
	wc->pkey_index = pkey_index;
	wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
	wc->src_qp = IB_QP0;
	wc->qp = qp;
	wc->slid = slid;
	wc->sl = 0;
	wc->dlid_path_bits = 0;
	wc->port_num = port_num;
}

/* Total allocation size of *mp: the fixed header plus mp->mad_size. */
static size_t mad_priv_size(const struct ib_mad_private *mp)
{
	return sizeof(struct ib_mad_private) + mp->mad_size;
}

/*
 * Allocate a zeroed ib_mad_private followed by mad_size bytes and
 * record mad_size in it.  Returns NULL when the allocation fails.
 */
static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags)
{
	size_t size = sizeof(struct ib_mad_private) + mad_size;
	struct ib_mad_private *ret = kzalloc(size, flags);

	if (ret)
		ret->mad_size = mad_size;

	return ret;
}

/* Maximum MAD size reported by rdma_max_mad_size() for this port. */
static size_t port_mad_size(const struct ib_mad_port_private *port_priv)
{
	return rdma_max_mad_size(port_priv->device, port_priv->port_num);
}

/* Size of one ib_grh plus mp->mad_size. */
static size_t mad_priv_dma_size(const struct ib_mad_private *mp)
{
	return sizeof(struct ib_grh) + mp->mad_size;
}

/*
 * Decide whether an outgoing directed-route SMP must go on the wire or
 * can be handled by the local device, and in the latter case queue a
 * local completion on the agent.
 *
 * Return 0 if SMP is to be sent
 * Return 1 if SMP was consumed locally (whether or not solicited)
 * Return < 0 if error
 */
static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
				  struct ib_mad_send_wr_private *mad_send_wr)
{
	int ret = 0;
	struct ib_smp *smp = mad_send_wr->send_buf.mad;
	/* Same buffer viewed through the OPA SMP layout. */
	struct opa_smp *opa_smp = (struct opa_smp *)smp;
	unsigned long flags;
	struct ib_mad_local_private *local;
	struct ib_mad_private *mad_priv;
	struct ib_mad_port_private *port_priv;
	struct ib_mad_agent_private *recv_mad_agent = NULL;
	struct ib_device *device = mad_agent_priv->agent.device;
	u8 port_num;
	struct ib_wc mad_wc;
	struct ib_ud_wr *send_wr = &mad_send_wr->send_wr;
	size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
	u16 out_mad_pkey_index = 0;
	u16 drslid;
	bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
				    mad_agent_priv->qp_info->port_priv->port_num);

	if (rdma_cap_ib_switch(device) &&
	    smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
		port_num = send_wr->port_num;
	else
		port_num = mad_agent_priv->agent.port_num;

	/*
	 * Directed route handling starts if the initial LID routed part of
	 * a request or the ending LID routed part of a response is empty.
	 * If we are at the start of the LID routed part, don't update the
	 * hop_ptr or hop_cnt.  See section 14.2.2, Vol 1 IB spec.
	 */
	if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) {
		u32 opa_drslid;

		if ((opa_get_smp_direction(opa_smp)
		     ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
		     OPA_LID_PERMISSIVE &&
		     opa_smi_handle_dr_smp_send(opa_smp,
						rdma_cap_ib_switch(device),
						port_num) == IB_SMI_DISCARD) {
			ret = -EINVAL;
			dev_err(&device->dev, "OPA Invalid directed route\n");
			goto out;
		}
		opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
		/* Anything other than the permissive LID must fit in 16 bits. */
		if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) &&
		    opa_drslid & 0xffff0000) {
			ret = -EINVAL;
			dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n",
			       opa_drslid);
			goto out;
		}
		drslid = (u16)(opa_drslid & 0x0000ffff);

		/* Check to post send on QP or process locally */
		if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD &&
		    opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
			goto out;
	} else {
		if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
		     IB_LID_PERMISSIVE &&
		     smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
		     IB_SMI_DISCARD) {
			ret = -EINVAL;
			dev_err(&device->dev, "Invalid directed route\n");
			goto out;
		}
		drslid = be16_to_cpu(smp->dr_slid);

		/* Check to post send on QP or process locally */
		if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
		    smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
			goto out;
	}

	/* From here on the SMP is handled locally. */
	local = kmalloc(sizeof *local, GFP_ATOMIC);
	if (!local) {
		ret = -ENOMEM;
		goto out;
	}
	local->mad_priv = NULL;
	local->recv_mad_agent = NULL;

	mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC);
	if (!mad_priv) {
		ret = -ENOMEM;
		kfree(local);
		goto out;
	}

	build_smp_wc(mad_agent_priv->agent.qp,
		     send_wr->wr.wr_cqe, drslid,
		     send_wr->pkey_index,
		     send_wr->port_num, &mad_wc);

	if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) {
		mad_wc.byte_len = mad_send_wr->send_buf.hdr_len
					+ mad_send_wr->send_buf.data_len
					+ sizeof(struct ib_grh);
	}

	/* No GRH for DR SMP */
	ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
				  (const struct ib_mad_hdr *)smp, mad_size,
				  (struct ib_mad_hdr *)mad_priv->mad,
				  &mad_size, &out_mad_pkey_index);
	switch (ret)
	{
	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
		if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) &&
		    mad_agent_priv->agent.recv_handler) {
			local->mad_priv = mad_priv;
			local->recv_mad_agent = mad_agent_priv;
			/*
			 * Reference MAD agent until receive
			 * side of local completion handled
			 */
			atomic_inc(&mad_agent_priv->refcount);
		} else
			kfree(mad_priv);
		break;
	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
		kfree(mad_priv);
		break;
	case IB_MAD_RESULT_SUCCESS:
		/* Treat like an incoming receive MAD */
		port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
					    mad_agent_priv->agent.port_num);
		if (port_priv) {
			memcpy(mad_priv->mad, smp, mad_priv->mad_size);
			recv_mad_agent = find_mad_agent(port_priv,
						        (const struct ib_mad_hdr *)mad_priv->mad);
		}
		if (!port_priv || !recv_mad_agent) {
			/*
			 * No receiving agent so drop packet and
			 * generate send completion.
			 */
			kfree(mad_priv);
			break;
		}
		local->mad_priv = mad_priv;
		local->recv_mad_agent = recv_mad_agent;
		break;
	default:
		kfree(mad_priv);
		kfree(local);
		ret = -EINVAL;
		goto out;
	}

	local->mad_send_wr = mad_send_wr;
	if (opa) {
		local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index;
		local->return_wc_byte_len = mad_size;
	}
	/* Reference MAD agent until send side of local completion handled */
	atomic_inc(&mad_agent_priv->refcount);
	/* Queue local completion to local list */
	spin_lock_irqsave(&mad_agent_priv->lock, flags);
	list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
	queue_work(mad_agent_priv->qp_info->port_priv->wq,
		   &mad_agent_priv->local_work);
	ret = 1;
out:
	return ret;
}

/*
 * Padding needed so that data_len fills a whole number of segments of
 * (mad_size - hdr_len) bytes.  Returns 0 when data_len is already a
 * multiple of the segment size, and the full segment size when either
 * data_len or the segment size is zero.
 */
static int get_pad_size(int hdr_len, int data_len, size_t mad_size)
{
	int seg_size, pad;

	seg_size = mad_size - hdr_len;
	if (data_len && seg_size) {
		pad = seg_size - data_len % seg_size;
		return pad == seg_size ? 0 : pad;
	} else
		return seg_size;
}

/* Unlink and free every segment on mad_send_wr->rmpp_list. */
static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
{
	struct ib_rmpp_segment *s, *t;

	list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) {
		list_del(&s->list);
		kfree(s);
	}
}

/*
 * Allocate the RMPP data segments for send_wr (enough to hold
 * data_len + pad bytes), zero the padding at the end of the last
 * segment and initialise the RMPP header and the current/last-acked
 * segment pointers.
 *
 * Returns 0, or -ENOMEM after freeing any segments already allocated.
 */
static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
				size_t mad_size, gfp_t gfp_mask)
{
	struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
	struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
	struct ib_rmpp_segment *seg = NULL;
	int left, seg_size, pad;

	send_buf->seg_size = mad_size - send_buf->hdr_len;
	send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR;
	seg_size = send_buf->seg_size;
	pad = send_wr->pad;

	/* Allocate data segments. */
	for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
		seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
		if (!seg) {
			free_send_rmpp_list(send_wr);
			return -ENOMEM;
		}
		seg->num = ++send_buf->seg_count;
		list_add_tail(&seg->list, &send_wr->rmpp_list);
	}

	/* Zero any padding */
	/* seg is the last segment allocated by the loop above. */
	if (pad)
		memset(seg->data + seg_size - pad, 0, pad);

	rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
					  agent.rmpp_version;
	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);

	send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
					struct ib_rmpp_segment, list);
	send_wr->last_ack_seg = send_wr->cur_seg;
	return 0;
}

/*
 * Non-zero when the agent has an RMPP version set and did not register
 * with IB_MAD_USER_RMPP.
 */
int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent)
{
	return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
}
EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);

struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
					    u32 remote_qpn, u16 pkey_index,
					    int rmpp_active,
					    int hdr_len, int data_len,
					    gfp_t gfp_mask,
					    u8 base_version)
{
	struct ib_mad_agent_private *mad_agent_priv;
	struct ib_mad_send_wr_private *mad_send_wr;
	int pad, message_size, ret, size;
	void *buf;
	size_t mad_size;
	bool opa;

	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
				      agent);

	opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num);

	if (opa && base_version == OPA_MGMT_BASE_VERSION)
		mad_size = sizeof(struct opa_mad);
	else
		mad_size = sizeof(struct ib_mad);

	pad = get_pad_size(hdr_len, data_len, mad_size);
	message_size = hdr_len + data_len + pad;

	if (ib_mad_kernel_rmpp_agent(mad_agent)) {
		if (!rmpp_active && message_size > mad_size)
			return ERR_PTR(-EINVAL);
	} else
		if (rmpp_active || message_size > mad_size)
			return ERR_PTR(-EINVAL);

	size = rmpp_active ?
hdr_len : mad_size; buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); if (!buf) return ERR_PTR(-ENOMEM); mad_send_wr = (struct ib_mad_send_wr_private *)((char *)buf + size); INIT_LIST_HEAD(&mad_send_wr->rmpp_list); mad_send_wr->send_buf.mad = buf; mad_send_wr->send_buf.hdr_len = hdr_len; mad_send_wr->send_buf.data_len = data_len; mad_send_wr->pad = pad; mad_send_wr->mad_agent_priv = mad_agent_priv; mad_send_wr->sg_list[0].length = hdr_len; mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey; /* OPA MADs don't have to be the full 2048 bytes */ if (opa && base_version == OPA_MGMT_BASE_VERSION && data_len < mad_size - hdr_len) mad_send_wr->sg_list[1].length = data_len; else mad_send_wr->sg_list[1].length = mad_size - hdr_len; mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; mad_send_wr->mad_list.cqe.done = ib_mad_send_done; mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; mad_send_wr->send_wr.wr.num_sge = 2; mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED; mad_send_wr->send_wr.remote_qpn = remote_qpn; mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY; mad_send_wr->send_wr.pkey_index = pkey_index; if (rmpp_active) { ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask); if (ret) { kfree(buf); return ERR_PTR(ret); } } mad_send_wr->send_buf.mad_agent = mad_agent; atomic_inc(&mad_agent_priv->refcount); return &mad_send_wr->send_buf; } EXPORT_SYMBOL(ib_create_send_mad); int ib_get_mad_data_offset(u8 mgmt_class) { if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) return IB_MGMT_SA_HDR; else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || (mgmt_class == IB_MGMT_CLASS_BIS)) return IB_MGMT_DEVICE_HDR; else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) return IB_MGMT_VENDOR_HDR; else return IB_MGMT_MAD_HDR; } 
EXPORT_SYMBOL(ib_get_mad_data_offset); int ib_is_mad_class_rmpp(u8 mgmt_class) { if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) || (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || (mgmt_class == IB_MGMT_CLASS_BIS) || ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))) return 1; return 0; } EXPORT_SYMBOL(ib_is_mad_class_rmpp); void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num) { struct ib_mad_send_wr_private *mad_send_wr; struct list_head *list; mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); list = &mad_send_wr->cur_seg->list; if (mad_send_wr->cur_seg->num < seg_num) { list_for_each_entry(mad_send_wr->cur_seg, list, list) if (mad_send_wr->cur_seg->num == seg_num) break; } else if (mad_send_wr->cur_seg->num > seg_num) { list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list) if (mad_send_wr->cur_seg->num == seg_num) break; } return mad_send_wr->cur_seg->data; } EXPORT_SYMBOL(ib_get_rmpp_segment); static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr) { if (mad_send_wr->send_buf.seg_count) return ib_get_rmpp_segment(&mad_send_wr->send_buf, mad_send_wr->seg_num); else return (char *)mad_send_wr->send_buf.mad + mad_send_wr->send_buf.hdr_len; } void ib_free_send_mad(struct ib_mad_send_buf *send_buf) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; mad_agent_priv = container_of(send_buf->mad_agent, struct ib_mad_agent_private, agent); mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); free_send_rmpp_list(mad_send_wr); kfree(send_buf->mad); deref_mad_agent(mad_agent_priv); } EXPORT_SYMBOL(ib_free_send_mad); int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_qp_info *qp_info; struct list_head *list; - struct ib_send_wr *bad_send_wr; + const struct ib_send_wr *bad_send_wr; struct ib_mad_agent *mad_agent; struct 
ib_sge *sge; unsigned long flags; int ret; /* Set WR ID to find mad_send_wr upon completion */ qp_info = mad_send_wr->mad_agent_priv->qp_info; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; mad_send_wr->mad_list.cqe.done = ib_mad_send_done; mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; sge[0].addr = ib_dma_map_single(mad_agent->device, mad_send_wr->send_buf.mad, sge[0].length, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) return -ENOMEM; mad_send_wr->header_mapping = sge[0].addr; sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { ib_dma_unmap_single(mad_agent->device, mad_send_wr->header_mapping, sge[0].length, DMA_TO_DEVICE); return -ENOMEM; } mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); list = &qp_info->send_queue.list; } else { ret = 0; list = &qp_info->overflow_list; } if (!ret) { qp_info->send_queue.count++; list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); if (ret) { ib_dma_unmap_single(mad_agent->device, mad_send_wr->header_mapping, sge[0].length, DMA_TO_DEVICE); ib_dma_unmap_single(mad_agent->device, mad_send_wr->payload_mapping, sge[1].length, DMA_TO_DEVICE); } return ret; } /* * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client */ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, struct ib_mad_send_buf **bad_send_buf) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_buf *next_send_buf; struct ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int ret = -EINVAL; /* Walk 
list of send WRs and post each on send list */ for (; send_buf; send_buf = next_send_buf) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); mad_agent_priv = mad_send_wr->mad_agent_priv; if (!send_buf->mad_agent->send_handler || (send_buf->timeout_ms && !send_buf->mad_agent->recv_handler)) { ret = -EINVAL; goto error; } if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) { if (mad_agent_priv->agent.rmpp_version) { ret = -EINVAL; goto error; } } /* * Save pointer to next work request to post in case the * current one completes, and the user modifies the work * request associated with the completion */ next_send_buf = send_buf->next; mad_send_wr->send_wr.ah = send_buf->ah; if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { ret = handle_outgoing_dr_smp(mad_agent_priv, mad_send_wr); if (ret < 0) /* error */ goto error; else if (ret == 1) /* locally consumed */ continue; } mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid; /* Timeout will be updated after send completes */ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms); mad_send_wr->max_retries = send_buf->retries; mad_send_wr->retries_left = send_buf->retries; send_buf->retries = 0; /* Reference for work request to QP + response */ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); mad_send_wr->status = IB_WC_SUCCESS; /* Reference MAD agent until send completes */ atomic_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_send_rmpp_mad(mad_send_wr); if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) ret = ib_send_mad(mad_send_wr); } else ret = ib_send_mad(mad_send_wr); if (ret < 0) { /* Fail send request */ spin_lock_irqsave(&mad_agent_priv->lock, flags); 
list_del(&mad_send_wr->agent_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); goto error; } } return 0; error: if (bad_send_buf) *bad_send_buf = send_buf; return ret; } EXPORT_SYMBOL(ib_post_send_mad); /* * ib_free_recv_mad - Returns data buffers used to receive * a MAD to the access layer */ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *priv; struct list_head free_list; INIT_LIST_HEAD(&free_list); list_splice_init(&mad_recv_wc->rmpp_list, &free_list); list_for_each_entry_safe(mad_recv_buf, temp_recv_buf, &free_list, list) { mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc, recv_buf); mad_priv_hdr = container_of(mad_recv_wc, struct ib_mad_private_header, recv_wc); priv = container_of(mad_priv_hdr, struct ib_mad_private, header); kfree(priv); } } EXPORT_SYMBOL(ib_free_recv_mad); struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, void *context) { return ERR_PTR(-EINVAL); /* XXX: for now */ } EXPORT_SYMBOL(ib_redirect_mad_qp); int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc) { dev_err(&mad_agent->device->dev, "ib_process_mad_wc() not implemented yet\n"); return 0; } EXPORT_SYMBOL(ib_process_mad_wc); static int method_in_use(struct ib_mad_mgmt_method_table **method, struct ib_mad_reg_req *mad_reg_req) { int i; for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { pr_err("Method %d already in use\n", i); return -EINVAL; } } return 0; } static int allocate_method_table(struct ib_mad_mgmt_method_table **method) { /* Allocate management method table */ *method = kzalloc(sizeof **method, GFP_ATOMIC); return (*method) ? 
0 : (-ENOMEM); } /* * Check to see if there are any methods still in use */ static int check_method_table(struct ib_mad_mgmt_method_table *method) { int i; for (i = 0; i < IB_MGMT_MAX_METHODS; i++) if (method->agent[i]) return 1; return 0; } /* * Check to see if there are any method tables for this class still in use */ static int check_class_table(struct ib_mad_mgmt_class_table *class) { int i; for (i = 0; i < MAX_MGMT_CLASS; i++) if (class->method_table[i]) return 1; return 0; } static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class) { int i; for (i = 0; i < MAX_MGMT_OUI; i++) if (vendor_class->method_table[i]) return 1; return 0; } static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, const char *oui) { int i; for (i = 0; i < MAX_MGMT_OUI; i++) /* Is there matching OUI for this vendor class ? */ if (!memcmp(vendor_class->oui[i], oui, 3)) return i; return -1; } static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor) { int i; for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++) if (vendor->vendor_class[i]) return 1; return 0; } static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, struct ib_mad_agent_private *agent) { int i; /* Remove any methods for this mad agent */ for (i = 0; i < IB_MGMT_MAX_METHODS; i++) { if (method->agent[i] == agent) { method->agent[i] = NULL; } } } static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv, u8 mgmt_class) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_class_table **class; struct ib_mad_mgmt_method_table **method; int i, ret; port_priv = agent_priv->qp_info->port_priv; class = &port_priv->version[mad_reg_req->mgmt_class_version].class; if (!*class) { /* Allocate management class table for "new" class version */ *class = kzalloc(sizeof **class, GFP_ATOMIC); if (!*class) { ret = -ENOMEM; goto error1; } /* Allocate method table for this management class */ method = 
&(*class)->method_table[mgmt_class]; if ((ret = allocate_method_table(method))) goto error2; } else { method = &(*class)->method_table[mgmt_class]; if (!*method) { /* Allocate method table for this management class */ if ((ret = allocate_method_table(method))) goto error1; } } /* Now, make sure methods are not already in use */ if (method_in_use(method, mad_reg_req)) goto error3; /* Finally, add in methods being registered */ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; return 0; error3: /* Remove any methods for this mad agent */ remove_methods_mad_agent(*method, agent_priv); /* Now, check to see if there are any methods in use */ if (!check_method_table(*method)) { /* If not, release management method table */ kfree(*method); *method = NULL; } ret = -EINVAL; goto error1; error2: kfree(*class); *class = NULL; error1: return ret; } static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_vendor_class_table **vendor_table; struct ib_mad_mgmt_vendor_class_table *vendor = NULL; struct ib_mad_mgmt_vendor_class *vendor_class = NULL; struct ib_mad_mgmt_method_table **method; int i, ret = -ENOMEM; u8 vclass; /* "New" vendor (with OUI) class */ vclass = vendor_class_index(mad_reg_req->mgmt_class); port_priv = agent_priv->qp_info->port_priv; vendor_table = &port_priv->version[ mad_reg_req->mgmt_class_version].vendor; if (!*vendor_table) { /* Allocate mgmt vendor class table for "new" class version */ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); if (!vendor) goto error1; *vendor_table = vendor; } if (!(*vendor_table)->vendor_class[vclass]) { /* Allocate table for this management vendor class */ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); if (!vendor_class) goto error2; (*vendor_table)->vendor_class[vclass] = vendor_class; } for (i = 0; i < MAX_MGMT_OUI; i++) { /* Is there matching OUI for this vendor 
class ? */ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i], mad_reg_req->oui, 3)) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; if (!*method) goto error3; goto check_in_use; } } for (i = 0; i < MAX_MGMT_OUI; i++) { /* OUI slot available ? */ if (!is_vendor_oui((*vendor_table)->vendor_class[ vclass]->oui[i])) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; /* Allocate method table for this OUI */ if (!*method) { ret = allocate_method_table(method); if (ret) goto error3; } memcpy((*vendor_table)->vendor_class[vclass]->oui[i], mad_reg_req->oui, 3); goto check_in_use; } } dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); goto error3; check_in_use: /* Now, make sure methods are not already in use */ if (method_in_use(method, mad_reg_req)) goto error4; /* Finally, add in methods being registered */ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; return 0; error4: /* Remove any methods for this mad agent */ remove_methods_mad_agent(*method, agent_priv); /* Now, check to see if there are any methods in use */ if (!check_method_table(*method)) { /* If not, release management method table */ kfree(*method); *method = NULL; } ret = -EINVAL; error3: if (vendor_class) { (*vendor_table)->vendor_class[vclass] = NULL; kfree(vendor_class); } error2: if (vendor) { *vendor_table = NULL; kfree(vendor); } error1: return ret; } static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; int index; u8 mgmt_class; /* * Was MAD registration request supplied * with original registration ? 
*/ if (!agent_priv->reg_req) { goto out; } port_priv = agent_priv->qp_info->port_priv; mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); class = port_priv->version[ agent_priv->reg_req->mgmt_class_version].class; if (!class) goto vendor_check; method = class->method_table[mgmt_class]; if (method) { /* Remove any methods for this mad agent */ remove_methods_mad_agent(method, agent_priv); /* Now, check to see if there are any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ kfree(method); class->method_table[mgmt_class] = NULL; /* Any management classes left ? */ if (!check_class_table(class)) { /* If not, release management class table */ kfree(class); port_priv->version[ agent_priv->reg_req-> mgmt_class_version].class = NULL; } } } vendor_check: if (!is_vendor_class(mgmt_class)) goto out; /* normalize mgmt_class to vendor range 2 */ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class); vendor = port_priv->version[ agent_priv->reg_req->mgmt_class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[mgmt_class]; if (vendor_class) { index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui); if (index < 0) goto out; method = vendor_class->method_table[index]; if (method) { /* Remove any methods for this mad agent */ remove_methods_mad_agent(method, agent_priv); /* * Now, check to see if there are * any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ kfree(method); vendor_class->method_table[index] = NULL; memset(vendor_class->oui[index], 0, 3); /* Any OUIs left ? */ if (!check_vendor_class(vendor_class)) { /* If not, release vendor class table */ kfree(vendor_class); vendor->vendor_class[mgmt_class] = NULL; /* Any other vendor classes left ? */ if (!check_vendor_table(vendor)) { kfree(vendor); port_priv->version[ agent_priv->reg_req-> mgmt_class_version]. 
vendor = NULL; } } } } } out: return; } static struct ib_mad_agent_private * find_mad_agent(struct ib_mad_port_private *port_priv, const struct ib_mad_hdr *mad_hdr) { struct ib_mad_agent_private *mad_agent = NULL; unsigned long flags; spin_lock_irqsave(&port_priv->reg_lock, flags); if (ib_response_mad(mad_hdr)) { u32 hi_tid; struct ib_mad_agent_private *entry; /* * Routing is based on high 32 bits of transaction ID * of MAD. */ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; list_for_each_entry(entry, &port_priv->agent_list, agent_list) { if (entry->agent.hi_tid == hi_tid) { mad_agent = entry; break; } } } else { struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; const struct ib_vendor_mad *vendor_mad; int index; /* * Routing is based on version, class, and method * For "newer" vendor MADs, also based on OUI */ if (mad_hdr->class_version >= MAX_MGMT_VERSION) goto out; if (!is_vendor_class(mad_hdr->mgmt_class)) { class = port_priv->version[ mad_hdr->class_version].class; if (!class) goto out; if (convert_mgmt_class(mad_hdr->mgmt_class) >= ARRAY_SIZE(class->method_table)) goto out; method = class->method_table[convert_mgmt_class( mad_hdr->mgmt_class)]; if (method) mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } else { vendor = port_priv->version[ mad_hdr->class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[vendor_class_index( mad_hdr->mgmt_class)]; if (!vendor_class) goto out; /* Find matching OUI */ vendor_mad = (const struct ib_vendor_mad *)mad_hdr; index = find_vendor_oui(vendor_class, vendor_mad->oui); if (index == -1) goto out; method = vendor_class->method_table[index]; if (method) { mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } } } if (mad_agent) { if (mad_agent->agent.recv_handler) atomic_inc(&mad_agent->refcount); else { dev_notice(&port_priv->device->dev, "No receive 
handler for client %p on port %d\n", &mad_agent->agent, port_priv->port_num); mad_agent = NULL; } } out: spin_unlock_irqrestore(&port_priv->reg_lock, flags); return mad_agent; } static int validate_mad(const struct ib_mad_hdr *mad_hdr, const struct ib_mad_qp_info *qp_info, bool opa) { int valid = 0; u32 qp_num = qp_info->qp->qp_num; /* Make sure MAD base version is understood */ if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { pr_err("MAD received with unsupported base version %d %s\n", mad_hdr->base_version, opa ? "(opa)" : ""); goto out; } /* Filter SMI packets sent to other than QP0 */ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { if (qp_num == 0) valid = 1; } else { /* CM attributes other than ClassPortInfo only use Send method */ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && (mad_hdr->method != IB_MGMT_METHOD_SEND)) goto out; /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; } out: return valid; } static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv, const struct ib_mad_hdr *mad_hdr) { const struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (const struct ib_rmpp_mad *)mad_hdr; return !mad_agent_priv->agent.rmpp_version || !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE) || (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); } static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, const struct ib_mad_recv_wc *rwc) { return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class == rwc->recv_buf.mad->mad_hdr.mgmt_class; } static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, const struct ib_mad_send_wr_private *wr, const struct ib_mad_recv_wc *rwc ) { struct ib_ah_attr attr; u8 send_resp, rcv_resp; union 
ib_gid sgid; struct ib_device *device = mad_agent_priv->agent.device; u8 port_num = mad_agent_priv->agent.port_num; u8 lmc; send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad); rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr); if (send_resp == rcv_resp) /* both requests, or both responses. GIDs different */ return 0; if (ib_query_ah(wr->send_buf.ah, &attr)) /* Assume not equal, to avoid false positives. */ return 0; if (!!(attr.ah_flags & IB_AH_GRH) != !!(rwc->wc->wc_flags & IB_WC_GRH)) /* one has GID, other does not. Assume different */ return 0; if (!send_resp && rcv_resp) { /* is request/response. */ if (!(attr.ah_flags & IB_AH_GRH)) { if (ib_get_cached_lmc(device, port_num, &lmc)) return 0; return (!lmc || !((attr.src_path_bits ^ rwc->wc->dlid_path_bits) & ((1 << lmc) - 1))); } else { if (ib_get_cached_gid(device, port_num, attr.grh.sgid_index, &sgid, NULL)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); } } if (!(attr.ah_flags & IB_AH_GRH)) return attr.dlid == rwc->wc->slid; else return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw, 16); } static inline int is_direct(u8 class) { return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE); } struct ib_mad_send_wr_private* ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, const struct ib_mad_recv_wc *wc) { struct ib_mad_send_wr_private *wr; const struct ib_mad_hdr *mad_hdr; mad_hdr = &wc->recv_buf.mad->mad_hdr; list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { if ((wr->tid == mad_hdr->tid) && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) return (wr->status == IB_WC_SUCCESS) ? 
wr : NULL; } /* * It's possible to receive the response before we've * been notified that the send has completed */ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) && wr->tid == mad_hdr->tid && wr->timeout && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } return NULL; } void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) { mad_send_wr->timeout = 0; if (mad_send_wr->refcount == 1) list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->done_list); } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, mad_recv_wc); if (!mad_recv_wc) { deref_mad_agent(mad_agent_priv); return; } } /* Complete corresponding request */ if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) { spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { /* user rmpp is in effect * and this is an active RMPP MAD */ mad_agent_priv->agent.recv_handler( &mad_agent_priv->agent, NULL, mad_recv_wc); 
atomic_dec(&mad_agent_priv->refcount); } else { /* not user rmpp, revert to normal behavior and * drop the mad */ ib_free_recv_mad(mad_recv_wc); deref_mad_agent(mad_agent_priv); return; } } else { ib_mark_mad_done(mad_send_wr); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ mad_agent_priv->agent.recv_handler( &mad_agent_priv->agent, &mad_send_wr->send_buf, mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &mad_send_wr->send_buf; ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } } else { mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } } static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, const struct ib_mad_qp_info *qp_info, const struct ib_wc *wc, int port_num, struct ib_mad_private *recv, struct ib_mad_private *response) { enum smi_forward_action retsmi; struct ib_smp *smp = (struct ib_smp *)recv->mad; if (smi_handle_dr_smp_recv(smp, rdma_cap_ib_switch(port_priv->device), port_num, port_priv->device->phys_port_cnt) == IB_SMI_DISCARD) return IB_SMI_DISCARD; retsmi = smi_check_forward_dr_smp(smp); if (retsmi == IB_SMI_LOCAL) return IB_SMI_HANDLE; if (retsmi == IB_SMI_SEND) { /* don't forward */ if (smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(port_priv->device), port_num) == IB_SMI_DISCARD) return IB_SMI_DISCARD; if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) return IB_SMI_DISCARD; } else if (rdma_cap_ib_switch(port_priv->device)) { /* forward case for switches */ memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; response->header.recv_wc.recv_buf.grh = &response->grh; agent_send_response((const struct ib_mad_hdr *)response->mad, &response->grh, wc, port_priv->device, 
smi_get_fwd_port(smp), qp_info->qp->qp_num, response->mad_size, false); return IB_SMI_DISCARD; } return IB_SMI_HANDLE; } static bool generate_unmatched_resp(const struct ib_mad_private *recv, struct ib_mad_private *response, size_t *resp_len, bool opa) { const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad; struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad; if (recv_hdr->method == IB_MGMT_METHOD_GET || recv_hdr->method == IB_MGMT_METHOD_SET) { memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; response->header.recv_wc.recv_buf.grh = &response->grh; resp_hdr->method = IB_MGMT_METHOD_GET_RESP; resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) resp_hdr->status |= IB_SMP_DIRECTION; if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) { if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) *resp_len = opa_get_smp_header_size( (const struct opa_smp *)recv->mad); else *resp_len = sizeof(struct ib_mad_hdr); } return true; } else { return false; } } static enum smi_action handle_opa_smi(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info, struct ib_wc *wc, int port_num, struct ib_mad_private *recv, struct ib_mad_private *response) { enum smi_forward_action retsmi; struct opa_smp *smp = (struct opa_smp *)recv->mad; if (opa_smi_handle_dr_smp_recv(smp, rdma_cap_ib_switch(port_priv->device), port_num, port_priv->device->phys_port_cnt) == IB_SMI_DISCARD) return IB_SMI_DISCARD; retsmi = opa_smi_check_forward_dr_smp(smp); if (retsmi == IB_SMI_LOCAL) return IB_SMI_HANDLE; if (retsmi == IB_SMI_SEND) { /* don't forward */ if (opa_smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(port_priv->device), port_num) == IB_SMI_DISCARD) return IB_SMI_DISCARD; if 
(opa_smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) return IB_SMI_DISCARD; } else if (rdma_cap_ib_switch(port_priv->device)) { /* forward case for switches */ memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; response->header.recv_wc.recv_buf.opa_mad = (struct opa_mad *)response->mad; response->header.recv_wc.recv_buf.grh = &response->grh; agent_send_response((const struct ib_mad_hdr *)response->mad, &response->grh, wc, port_priv->device, opa_smi_get_fwd_port(smp), qp_info->qp->qp_num, recv->header.wc.byte_len, true); return IB_SMI_DISCARD; } return IB_SMI_HANDLE; } static enum smi_action handle_smi(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info, struct ib_wc *wc, int port_num, struct ib_mad_private *recv, struct ib_mad_private *response, bool opa) { struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad; if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION && mad_hdr->class_version == OPA_SMI_CLASS_VERSION) return handle_opa_smi(port_priv, qp_info, wc, port_num, recv, response); return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); } static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_mad_port_private *port_priv = cq->cq_context; struct ib_mad_list_head *mad_list = container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; size_t mad_size; u16 resp_mad_pkey_index = 0; bool opa; if (list_empty_careful(&port_priv->port_list)) return; if (wc->status != IB_WC_SUCCESS) { /* * Receive errors indicate that the QP has entered the error * state - error handling/shutdown code will cleanup */ return; } qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); opa = rdma_cap_opa_mad(qp_info->port_priv->device, 
qp_info->port_priv->port_num); mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); ib_dma_unmap_single(port_priv->device, recv->header.mapping, mad_priv_dma_size(recv), DMA_FROM_DEVICE); /* Setup MAD receive work completion from "normal" work completion */ recv->header.wc = *wc; recv->header.recv_wc.wc = &recv->header.wc; if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) { recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh); recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); } else { recv->header.recv_wc.mad_len = sizeof(struct ib_mad); recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); } recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; if (atomic_read(&qp_info->snoop_count)) snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); /* Validate MAD */ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; mad_size = recv->mad_size; response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) goto out; if (rdma_cap_ib_switch(port_priv->device)) port_num = wc->port_num; else port_num = port_priv->port_num; if (((struct ib_mad_hdr *)recv->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { if (handle_smi(port_priv, qp_info, wc, port_num, recv, response, opa) == IB_SMI_DISCARD) goto out; } /* Give driver "right of first refusal" on incoming MAD */ if (port_priv->device->process_mad) { ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, wc, &recv->grh, (const struct ib_mad_hdr *)recv->mad, recv->mad_size, (struct ib_mad_hdr *)response->mad, &mad_size, &resp_mad_pkey_index); if (opa) wc->pkey_index = resp_mad_pkey_index; if (ret & IB_MAD_RESULT_SUCCESS) { if (ret & IB_MAD_RESULT_CONSUMED) goto out; if (ret & IB_MAD_RESULT_REPLY) { agent_send_response((const struct ib_mad_hdr 
*)response->mad, &recv->grh, wc, port_priv->device, port_num, qp_info->qp->qp_num, mad_size, opa); goto out; } } } mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); if (mad_agent) { ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); /* * recv is freed up in error cases in ib_mad_complete_recv * or via recv_handler in ib_mad_complete_recv() */ recv = NULL; } else if ((ret & IB_MAD_RESULT_SUCCESS) && generate_unmatched_resp(recv, response, &mad_size, opa)) { agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, port_priv->device, port_num, qp_info->qp->qp_num, mad_size, opa); } out: /* Post another receive request for this QP */ if (response) { ib_mad_post_receive_mads(qp_info, response); kfree(recv); } else ib_mad_post_receive_mads(qp_info, recv); } static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_send_wr_private *mad_send_wr; unsigned long delay; if (list_empty(&mad_agent_priv->wait_list)) { cancel_delayed_work(&mad_agent_priv->timed_work); } else { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_agent_priv->timeout, mad_send_wr->timeout)) { mad_agent_priv->timeout = mad_send_wr->timeout; delay = mad_send_wr->timeout - jiffies; if ((long)delay <= 0) delay = 1; mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->timed_work, delay); } } } static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *temp_mad_send_wr; struct list_head *list_item; unsigned long delay; mad_agent_priv = mad_send_wr->mad_agent_priv; list_del(&mad_send_wr->agent_list); delay = mad_send_wr->timeout; mad_send_wr->timeout += jiffies; if (delay) { list_for_each_prev(list_item, &mad_agent_priv->wait_list) { temp_mad_send_wr = list_entry(list_item, struct ib_mad_send_wr_private, agent_list); if 
(time_after(mad_send_wr->timeout, temp_mad_send_wr->timeout)) break; } } else list_item = &mad_agent_priv->wait_list; list_add(&mad_send_wr->agent_list, list_item); /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->timed_work, delay); } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, int timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); } /* * Process a send work completion */ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_agent_private *mad_agent_priv; unsigned long flags; int ret; mad_agent_priv = mad_send_wr->mad_agent_priv; spin_lock_irqsave(&mad_agent_priv->lock, flags); if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); if (ret == IB_RMPP_RESULT_CONSUMED) goto done; } else ret = IB_RMPP_RESULT_UNHANDLED; if (mad_send_wc->status != IB_WC_SUCCESS && mad_send_wr->status == IB_WC_SUCCESS) { mad_send_wr->status = mad_send_wc->status; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } if (--mad_send_wr->refcount > 0) { if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && mad_send_wr->status == IB_WC_SUCCESS) { wait_for_response(mad_send_wr); } goto done; } /* Remove send from MAD agent and notify client of completion */ list_del(&mad_send_wr->agent_list); adjust_timeout(mad_agent_priv); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (mad_send_wr->status != IB_WC_SUCCESS ) mad_send_wc->status = mad_send_wr->status; if (ret == IB_RMPP_RESULT_INTERNAL) ib_rmpp_send_handler(mad_send_wc); else mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); return; done: 
spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_mad_port_private *port_priv = cq->cq_context; struct ib_mad_list_head *mad_list = container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; - struct ib_send_wr *bad_send_wr; + const struct ib_send_wr *bad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; int ret; if (list_empty_careful(&port_priv->port_list)) return; if (wc->status != IB_WC_SUCCESS) { if (!ib_mad_send_error(port_priv, wc)) return; } mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; qp_info = send_queue->qp_info; retry: ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->header_mapping, mad_send_wr->sg_list[0].length, DMA_TO_DEVICE); ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->payload_mapping, mad_send_wr->sg_list[1].length, DMA_TO_DEVICE); queued_send_wr = NULL; spin_lock_irqsave(&send_queue->lock, flags); list_del(&mad_list->list); /* Move queued send to the send queue */ if (send_queue->count-- > send_queue->max_active) { mad_list = container_of(qp_info->overflow_list.next, struct ib_mad_list_head, list); queued_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); list_move_tail(&mad_list->list, &send_queue->list); } spin_unlock_irqrestore(&send_queue->lock, flags); mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_send_wc.status = wc->status; mad_send_wc.vendor_err = wc->vendor_err; if (atomic_read(&qp_info->snoop_count)) snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, &bad_send_wr); if (ret) { 
dev_err(&port_priv->device->dev, "ib_post_send failed: %d\n", ret); mad_send_wr = queued_send_wr; wc->status = IB_WC_LOC_QP_OP_ERR; goto retry; } } } static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_list_head *mad_list; unsigned long flags; spin_lock_irqsave(&qp_info->send_queue.lock, flags); list_for_each_entry(mad_list, &qp_info->send_queue.list, list) { mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); mad_send_wr->retry = 1; } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { struct ib_mad_list_head *mad_list = container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests */ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); if (wc->status == IB_WC_WR_FLUSH_ERR) { if (mad_send_wr->retry) { /* Repost send */ - struct ib_send_wr *bad_send_wr; + const struct ib_send_wr *bad_send_wr; mad_send_wr->retry = 0; ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); if (!ret) return false; } } else { struct ib_qp_attr *attr; /* Transition QP to RTS and fail offending send */ attr = kmalloc(sizeof *attr, GFP_KERNEL); if (attr) { attr->qp_state = IB_QPS_RTS; attr->cur_qp_state = IB_QPS_SQE; ret = ib_modify_qp(qp_info->qp, attr, IB_QP_STATE | IB_QP_CUR_STATE); kfree(attr); if (ret) dev_err(&port_priv->device->dev, "%s - ib_modify_qp to RTS: %d\n", __func__, ret); else mark_sends_for_retry(qp_info); } } return true; } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) { unsigned long flags; struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; struct ib_mad_send_wc mad_send_wc; struct list_head 
cancel_list; INIT_LIST_HEAD(&cancel_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &mad_agent_priv->send_list, agent_list) { if (mad_send_wr->status == IB_WC_SUCCESS) { mad_send_wr->status = IB_WC_WR_FLUSH_ERR; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } } /* Empty wait list to prevent receives from finding a request */ list_splice_init(&mad_agent_priv->wait_list, &cancel_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Report all cancelled requests */ mad_send_wc.status = IB_WC_WR_FLUSH_ERR; mad_send_wc.vendor_err = 0; list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &cancel_list, agent_list) { mad_send_wc.send_buf = &mad_send_wr->send_buf; list_del(&mad_send_wr->agent_list); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); } } static struct ib_mad_send_wr_private* find_send_wr(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_buf *send_buf) { struct ib_mad_send_wr_private *mad_send_wr; list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list, agent_list) { if (&mad_send_wr->send_buf == send_buf) return mad_send_wr; } list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, agent_list) { if (is_rmpp_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) && &mad_send_wr->send_buf == send_buf) return mad_send_wr; } return NULL; } int ib_modify_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, u32 timeout_ms) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int active; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } active = (!mad_send_wr->timeout 
|| mad_send_wr->refcount > 1); if (!timeout_ms) { mad_send_wr->status = IB_WC_WR_FLUSH_ERR; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } mad_send_wr->send_buf.timeout_ms = timeout_ms; if (active) mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); else ib_reset_mad_timeout(mad_send_wr, timeout_ms); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return 0; } EXPORT_SYMBOL(ib_modify_mad); void ib_cancel_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf) { ib_modify_mad(mad_agent, send_buf, 0); } EXPORT_SYMBOL(ib_cancel_mad); static void local_completions(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_local_private *local; struct ib_mad_agent_private *recv_mad_agent; unsigned long flags; int free_mad; struct ib_wc wc; struct ib_mad_send_wc mad_send_wc; bool opa; mad_agent_priv = container_of(work, struct ib_mad_agent_private, local_work); opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, mad_agent_priv->qp_info->port_priv->port_num); spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->local_list)) { local = list_entry(mad_agent_priv->local_list.next, struct ib_mad_local_private, completion_list); list_del(&local->completion_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); free_mad = 0; if (local->mad_priv) { u8 base_version; recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { dev_err(&mad_agent_priv->agent.device->dev, "No receive MAD agent for local completion\n"); free_mad = 1; goto local_send_completion; } /* * Defined behavior is to complete response * before request */ build_smp_wc(recv_mad_agent->agent.qp, local->mad_send_wr->send_wr.wr.wr_cqe, be16_to_cpu(IB_LID_PERMISSIVE), local->mad_send_wr->send_wr.pkey_index, recv_mad_agent->agent.port_num, &wc); local->mad_priv->header.recv_wc.wc = &wc; base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version; if (opa && base_version == 
OPA_MGMT_BASE_VERSION) { local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len; local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); } else { local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad); local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); } INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); list_add(&local->mad_priv->header.recv_wc.recv_buf.list, &local->mad_priv->header.recv_wc.rmpp_list); local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = (struct ib_mad *)local->mad_priv->mad; if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) snoop_recv(recv_mad_agent->qp_info, &local->mad_priv->header.recv_wc, IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); spin_unlock_irqrestore(&recv_mad_agent->lock, flags); } local_send_completion: /* Complete send */ mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &local->mad_send_wr->send_buf; if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) snoop_send(mad_agent_priv->qp_info, &local->mad_send_wr->send_buf, &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); spin_lock_irqsave(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); if (free_mad) kfree(local->mad_priv); kfree(local); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) { int ret; if (!mad_send_wr->retries_left) return -ETIMEDOUT; mad_send_wr->retries_left--; mad_send_wr->send_buf.retries++; mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); 
switch (ret) { case IB_RMPP_RESULT_UNHANDLED: ret = ib_send_mad(mad_send_wr); break; case IB_RMPP_RESULT_CONSUMED: ret = 0; break; default: ret = -ECOMM; break; } } else ret = ib_send_mad(mad_send_wr); if (!ret) { mad_send_wr->refcount++; list_add_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } return ret; } static void timeout_sends(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); mad_send_wc.vendor_err = 0; spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_send_wr->timeout, jiffies)) { delay = mad_send_wr->timeout - jiffies; if ((long)delay <= 0) delay = 1; queue_delayed_work(mad_agent_priv->qp_info-> port_priv->wq, &mad_agent_priv->timed_work, delay); break; } list_del(&mad_send_wr->agent_list); if (mad_send_wr->status == IB_WC_SUCCESS && !retry_send(mad_send_wr)) continue; spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (mad_send_wr->status == IB_WC_SUCCESS) mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; else mad_send_wc.status = mad_send_wr->status; mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } /* * Allocate receive MADs and post receive WRs for them */ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad) { unsigned long flags; int post, ret; struct ib_mad_private *mad_priv; struct ib_sge sg_list; - struct ib_recv_wr recv_wr, *bad_recv_wr; + struct ib_recv_wr recv_wr; + const struct 
ib_recv_wr *bad_recv_wr; struct ib_mad_queue *recv_queue = &qp_info->recv_queue; /* Initialize common scatter list fields */ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; /* Initialize common receive WR fields */ recv_wr.next = NULL; recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; do { /* Allocate and map receive buffer */ if (mad) { mad_priv = mad; mad = NULL; } else { mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; break; } } sg_list.length = mad_priv_dma_size(mad_priv); sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, &mad_priv->grh, mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { ret = -ENOMEM; break; } mad_priv->header.mapping = sg_list.addr; mad_priv->header.mad_list.mad_queue = recv_queue; mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); post = (++recv_queue->count < recv_queue->max_active); list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); spin_unlock_irqrestore(&recv_queue->lock, flags); ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr); if (ret) { spin_lock_irqsave(&recv_queue->lock, flags); list_del(&mad_priv->header.mad_list.list); recv_queue->count--; spin_unlock_irqrestore(&recv_queue->lock, flags); ib_dma_unmap_single(qp_info->port_priv->device, mad_priv->header.mapping, mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); kfree(mad_priv); dev_err(&qp_info->port_priv->device->dev, "ib_post_recv failed: %d\n", ret); break; } } while (post); return ret; } /* * Return all the posted receive MADs */ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) { struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; if (!qp_info->qp) return; while (!list_empty(&qp_info->recv_queue.list)) { mad_list = 
list_entry(qp_info->recv_queue.list.next, struct ib_mad_list_head, list); mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); /* Remove from posted receive MAD list */ list_del(&mad_list->list); ib_dma_unmap_single(qp_info->port_priv->device, recv->header.mapping, mad_priv_dma_size(recv), DMA_FROM_DEVICE); kfree(recv); } qp_info->recv_queue.count = 0; } /* * Start the port */ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) { int ret, i; struct ib_qp_attr *attr; struct ib_qp *qp; u16 pkey_index; attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) return -ENOMEM; ret = ib_find_pkey(port_priv->device, port_priv->port_num, IB_DEFAULT_PKEY_FULL, &pkey_index); if (ret) pkey_index = 0; for (i = 0; i < IB_MAD_QPS_CORE; i++) { qp = port_priv->qp_info[i].qp; if (!qp) continue; /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition */ attr->qp_state = IB_QPS_INIT; attr->pkey_index = pkey_index; attr->qkey = (qp->qp_num == 0) ? 
0 : IB_QP1_QKEY; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); if (ret) { dev_err(&port_priv->device->dev, "Couldn't change QP%d state to INIT: %d\n", i, ret); goto out; } attr->qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, attr, IB_QP_STATE); if (ret) { dev_err(&port_priv->device->dev, "Couldn't change QP%d state to RTR: %d\n", i, ret); goto out; } attr->qp_state = IB_QPS_RTS; attr->sq_psn = IB_MAD_SEND_Q_PSN; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { dev_err(&port_priv->device->dev, "Couldn't change QP%d state to RTS: %d\n", i, ret); goto out; } } ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); if (ret) { dev_err(&port_priv->device->dev, "Failed to request completion notification: %d\n", ret); goto out; } for (i = 0; i < IB_MAD_QPS_CORE; i++) { if (!port_priv->qp_info[i].qp) continue; ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { dev_err(&port_priv->device->dev, "Couldn't post receive WRs\n"); goto out; } } out: kfree(attr); return ret; } static void qp_event_handler(struct ib_event *event, void *qp_context) { struct ib_mad_qp_info *qp_info = qp_context; /* It's worse than that! He's dead, Jim! 
*/ dev_err(&qp_info->port_priv->device->dev, "Fatal error (%d) on MAD QP (%d)\n", event->event, qp_info->qp->qp_num); } static void init_mad_queue(struct ib_mad_qp_info *qp_info, struct ib_mad_queue *mad_queue) { mad_queue->qp_info = qp_info; mad_queue->count = 0; spin_lock_init(&mad_queue->lock); INIT_LIST_HEAD(&mad_queue->list); } static void init_mad_qp(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info) { qp_info->port_priv = port_priv; init_mad_queue(qp_info, &qp_info->send_queue); init_mad_queue(qp_info, &qp_info->recv_queue); INIT_LIST_HEAD(&qp_info->overflow_list); spin_lock_init(&qp_info->snoop_lock); qp_info->snoop_table = NULL; qp_info->snoop_table_size = 0; atomic_set(&qp_info->snoop_count, 0); } static int create_mad_qp(struct ib_mad_qp_info *qp_info, enum ib_qp_type qp_type) { struct ib_qp_init_attr qp_init_attr; int ret; memset(&qp_init_attr, 0, sizeof qp_init_attr); qp_init_attr.send_cq = qp_info->port_priv->cq; qp_init_attr.recv_cq = qp_info->port_priv->cq; qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; qp_init_attr.cap.max_send_wr = mad_sendq_size; qp_init_attr.cap.max_recv_wr = mad_recvq_size; qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG; qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG; qp_init_attr.qp_type = qp_type; qp_init_attr.port_num = qp_info->port_priv->port_num; qp_init_attr.qp_context = qp_info; qp_init_attr.event_handler = qp_event_handler; qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); if (IS_ERR(qp_info->qp)) { dev_err(&qp_info->port_priv->device->dev, "Couldn't create ib_mad QP%d\n", get_spl_qp_index(qp_type)); ret = PTR_ERR(qp_info->qp); goto error; } /* Use minimum queue sizes unless the CQ is resized */ qp_info->send_queue.max_active = mad_sendq_size; qp_info->recv_queue.max_active = mad_recvq_size; return 0; error: return ret; } static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { if (!qp_info->qp) return; ib_destroy_qp(qp_info->qp); kfree(qp_info->snoop_table); } /* 
* Open the port * Create the QP, PD, MR, and CQ if needed */ static int ib_mad_port_open(struct ib_device *device, int port_num) { int ret, cq_size; struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) return -EFAULT; if (WARN_ON(rdma_cap_opa_mad(device, port_num) && rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE)) return -EFAULT; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) return -ENOMEM; port_priv->device = device; port_priv->port_num = port_num; spin_lock_init(&port_priv->reg_lock); INIT_LIST_HEAD(&port_priv->agent_list); init_mad_qp(port_priv, &port_priv->qp_info[0]); init_mad_qp(port_priv, &port_priv->qp_info[1]); cq_size = mad_sendq_size + mad_recvq_size; has_smi = rdma_cap_ib_smi(device, port_num); if (has_smi) cq_size *= 2; port_priv->pd = ib_alloc_pd(device, 0); if (IS_ERR(port_priv->pd)) { dev_err(&device->dev, "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); goto error3; } port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, IB_POLL_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); goto error4; } if (has_smi) { ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); if (ret) goto error6; } ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); if (ret) goto error7; snprintf(name, sizeof name, "ib_mad%d", port_num); port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); if (!port_priv->wq) { ret = -ENOMEM; goto error8; } spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); ret = ib_mad_port_start(port_priv); if (ret) { dev_err(&device->dev, "Couldn't start port\n"); goto error9; } return 0; error9: spin_lock_irqsave(&ib_mad_port_list_lock, flags); 
list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); error8: destroy_mad_qp(&port_priv->qp_info[1]); error7: destroy_mad_qp(&port_priv->qp_info[0]); error6: ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error4: ib_dealloc_pd(port_priv->pd); error3: kfree(port_priv); return ret; } /* * Close the port * If there are no classes using the port, free the port * resources (CQ, MR, PD, QP) and remove the port's info structure */ static int ib_mad_port_close(struct ib_device *device, int port_num) { struct ib_mad_port_private *port_priv; unsigned long flags; spin_lock_irqsave(&ib_mad_port_list_lock, flags); port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); ib_free_cq(port_priv->cq); ib_dealloc_pd(port_priv->pd); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ kfree(port_priv); return 0; } static void ib_mad_init_device(struct ib_device *device) { int start, i; start = rdma_start_port(device); for (i = start; i <= rdma_end_port(device); i++) { if (!rdma_cap_ib_mad(device, i)) continue; if (ib_mad_port_open(device, i)) { dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } if (ib_agent_port_open(device, i)) { dev_err(&device->dev, "Couldn't open port %d for agents\n", i); goto error_agent; } } return; error_agent: if (ib_mad_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d\n", i); error: while (--i >= start) { if (!rdma_cap_ib_mad(device, i)) continue; 
if (ib_agent_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d\n", i); } } static void ib_mad_remove_device(struct ib_device *device, void *client_data) { int i; for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { if (!rdma_cap_ib_mad(device, i)) continue; if (ib_agent_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d\n", i); } } static struct ib_client mad_client = { .name = "mad", .add = ib_mad_init_device, .remove = ib_mad_remove_device }; int ib_mad_init(void) { mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { pr_err("Couldn't register ib_mad client\n"); return -EINVAL; } return 0; } void ib_mad_cleanup(void) { ib_unregister_client(&mad_client); } diff --git a/sys/ofed/drivers/infiniband/core/ib_uverbs_cmd.c b/sys/ofed/drivers/infiniband/core/ib_uverbs_cmd.c index dcc73dc5848e..ab6e27d40d33 100644 --- a/sys/ofed/drivers/infiniband/core/ib_uverbs_cmd.c +++ b/sys/ofed/drivers/infiniband/core/ib_uverbs_cmd.c @@ -1,4303 +1,4306 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * Copyright (c) 2006 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

/*
 * NOTE(review): the header names that should follow the bare "#include"
 * directives below are missing in this copy (they look stripped by an
 * extraction step) -- restore them from the upstream file.
 */
#include
__FBSDID("$FreeBSD$");

#define LINUXKPI_PARAM_PREFIX ibcore_

#include
#include
#include
#include
#include
#include

#include "uverbs.h"
#include "core_priv.h"

#include

/* Named tag passed to init_uobj(); one instance per uobject kind below. */
struct uverbs_lock_class {
	char name[16];
};

static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" };
static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" };
static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" };
static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" };
static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" };
static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" };
static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" };

/*
 * The ib_uobject locking scheme is as follows:
 *
 * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
 *   needs to be held during all idr write operations (insert and
 *   remove).  Lookups do not take this lock: they run inside an
 *   rcu_read_lock() section instead, and the reference on the
 *   object's kref must be taken before rcu_read_unlock().
 *
 * - Each object also has an rwsem.  This rwsem must be held for
 *   reading while an operation that uses the object is performed.
 *   For example, while registering an MR, the associated PD's
 *   uobject.mutex must be held for reading.  The rwsem must be held
 *   for writing while initializing or destroying an object.
 *
 * - In addition, each object has a "live" flag.  If this flag is not
 *   set, then lookups of the object will fail even if it is found in
 *   the idr.
This handles a reader that blocks and does not acquire
 *   the rwsem until after the object is destroyed.  The destroy
 *   operation will set the live flag to 0 and then drop the rwsem;
 *   this will allow the reader to acquire the rwsem, see that the
 *   live flag is 0, and then drop the rwsem and its reference to
 *   object.  The underlying storage will not be freed until the last
 *   reference to the object is dropped.
 */

/*
 * Initialise a freshly allocated uobject: record the user handle and the
 * owning context, initialise the kref and the rwsem, and mark it not live.
 * The lock-class argument 'c' is accepted but not used in this function.
 */
static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
		      struct ib_ucontext *context, struct uverbs_lock_class *c)
{
	uobj->user_handle = user_handle;
	uobj->context = context;
	kref_init(&uobj->ref);
	init_rwsem(&uobj->mutex);
	uobj->live = 0;
}

/* kref release callback: free the containing ib_uobject via kfree_rcu(). */
static void release_uobj(struct kref *kref)
{
	kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu);
}

/* Drop one reference; release_uobj() runs when the last one goes away. */
static void put_uobj(struct ib_uobject *uobj)
{
	kref_put(&uobj->ref, release_uobj);
}

/* Release the read side of uobj->mutex, then drop one reference. */
static void put_uobj_read(struct ib_uobject *uobj)
{
	up_read(&uobj->mutex);
	put_uobj(uobj);
}

/* Release the write side of uobj->mutex, then drop one reference. */
static void put_uobj_write(struct ib_uobject *uobj)
{
	up_write(&uobj->mutex);
	put_uobj(uobj);
}

/*
 * Insert uobj into 'idr' under ib_uverbs_idr_lock.  Memory is preloaded
 * with GFP_KERNEL before the spinlock is taken, so the allocation under
 * the lock uses GFP_NOWAIT.  On success the new id is stored in uobj->id.
 *
 * Returns 0 on success or the negative value returned by idr_alloc().
 */
static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
{
	int ret;

	idr_preload(GFP_KERNEL);
	spin_lock(&ib_uverbs_idr_lock);

	ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT);
	if (ret >= 0)
		uobj->id = ret;

	spin_unlock(&ib_uverbs_idr_lock);
	idr_preload_end();

	return ret < 0 ?
ret : 0;
}

/* Remove uobj->id from 'idr' while holding ib_uverbs_idr_lock. */
void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
{
	spin_lock(&ib_uverbs_idr_lock);
	idr_remove(idr, uobj->id);
	spin_unlock(&ib_uverbs_idr_lock);
}

/*
 * Look up 'id' in 'idr' inside an RCU read-side section.  If an object is
 * found and belongs to 'context', a kref reference is taken before the RCU
 * section ends and the object is returned; otherwise NULL is returned.
 * No rwsem is taken here and the "live" flag is not checked.
 */
static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
					 struct ib_ucontext *context)
{
	struct ib_uobject *uobj;

	rcu_read_lock();
	uobj = idr_find(idr, id);
	if (uobj) {
		if (uobj->context == context)
			kref_get(&uobj->ref);
		else
			uobj = NULL;
	}
	rcu_read_unlock();

	return uobj;
}

/*
 * Look up a uobject and take its rwsem for reading (with
 * SINGLE_DEPTH_NESTING when 'nested' is non-zero).  If the object is not
 * live, the lock and reference are dropped again and NULL is returned.
 * On success the caller holds a reference plus the read lock; both are
 * released by put_uobj_read().
 */
static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
					struct ib_ucontext *context, int nested)
{
	struct ib_uobject *uobj;

	uobj = __idr_get_uobj(idr, id, context);
	if (!uobj)
		return NULL;

	if (nested)
		down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING);
	else
		down_read(&uobj->mutex);
	if (!uobj->live) {
		put_uobj_read(uobj);
		return NULL;
	}

	return uobj;
}

/*
 * Write-lock counterpart of idr_read_uobj(): on success the caller holds
 * a reference plus the rwsem for writing (released by put_uobj_write()).
 * Returns NULL if the object is not found, belongs to another context, or
 * is not live.
 */
static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
					 struct ib_ucontext *context)
{
	struct ib_uobject *uobj;

	uobj = __idr_get_uobj(idr, id, context);
	if (!uobj)
		return NULL;

	down_write(&uobj->mutex);
	if (!uobj->live) {
		put_uobj_write(uobj);
		return NULL;
	}

	return uobj;
}

/*
 * Like idr_read_uobj(), but returns the uobject's ->object pointer
 * (or NULL when the lookup fails).
 */
static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
			  int nested)
{
	struct ib_uobject *uobj;

	uobj = idr_read_uobj(idr, id, context, nested);
	return uobj ?
uobj->object : NULL;
}

/*
 * Typed lookup helpers.  Each idr_read_*() is idr_read_obj() on the idr of
 * the matching object kind; each put_*_read() undoes it through
 * put_uobj_read() on the object's ->uobject.
 */

/* Read-lock lookup of a PD by handle (not nested). */
static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
{
	return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
}

static void put_pd_read(struct ib_pd *pd)
{
	put_uobj_read(pd->uobject);
}

/* Read-lock lookup of a CQ by handle; 'nested' is forwarded to idr_read_obj(). */
static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
{
	return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
}

static void put_cq_read(struct ib_cq *cq)
{
	put_uobj_read(cq->uobject);
}

/* Read-lock lookup of an AH by handle (not nested). */
static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
{
	return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
}

static void put_ah_read(struct ib_ah *ah)
{
	put_uobj_read(ah->uobject);
}

/* Read-lock lookup of a QP by handle (not nested). */
static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
{
	return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
}

/* Read-lock lookup of a WQ by handle (not nested). */
static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context)
{
	return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0);
}

static void put_wq_read(struct ib_wq *wq)
{
	put_uobj_read(wq->uobject);
}

/* Read-lock lookup of an RWQ indirection table by handle (not nested). */
static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle,
							       struct ib_ucontext *context)
{
	return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0);
}

static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table)
{
	put_uobj_read(ind_table->uobject);
}

/*
 * Write-lock lookup of a QP by handle; returns the QP or NULL.
 * Released with put_qp_write().
 */
static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
{
	struct ib_uobject *uobj;

	uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context);
	return uobj ?
uobj->object : NULL;
}

/* Undo idr_read_qp(). */
static void put_qp_read(struct ib_qp *qp)
{
	put_uobj_read(qp->uobject);
}

/* Undo idr_write_qp(). */
static void put_qp_write(struct ib_qp *qp)
{
	put_uobj_write(qp->uobject);
}

/* Read-lock lookup of an SRQ by handle (not nested). */
static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
{
	return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
}

static void put_srq_read(struct ib_srq *srq)
{
	put_uobj_read(srq->uobject);
}

/*
 * Read-lock lookup of an XRCD by handle.  Unlike the other helpers the
 * uobject itself is also returned through *uobj, because the release
 * helper put_xrcd_read() takes the uobject rather than the XRCD.
 */
static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context,
				     struct ib_uobject **uobj)
{
	*uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0);
	return *uobj ? (*uobj)->object : NULL;
}

static void put_xrcd_read(struct ib_uobject *uobj)
{
	put_uobj_read(uobj);
}

/*
 * GET_CONTEXT command: allocate the device ucontext for 'file', create the
 * async event file and return its fd together with num_comp_vectors.
 *
 * Returns in_len on success, -ENOSPC if the response buffer is too small,
 * -EFAULT on a failed user copy, -EINVAL if the file already has a
 * context, or the error from the driver / fd / event-file allocation.
 * file->mutex is held from the "already has a context" check until return.
 */
ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
			      struct ib_device *ib_dev,
			      const char __user *buf,
			      int in_len, int out_len)
{
	struct ib_uverbs_get_context cmd;
	struct ib_uverbs_get_context_resp resp;
	struct ib_udata udata;
	struct ib_ucontext *ucontext;
	struct file *filp;
	int ret;

	if (out_len < sizeof resp)
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	mutex_lock(&file->mutex);

	/* Only one context per uverbs file. */
	if (file->ucontext) {
		ret = -EINVAL;
		goto err;
	}

	/* Driver-private data follows the fixed-size command and response. */
	ib_uverbs_init_udata(&udata, buf + sizeof cmd,
			     u64_to_user_ptr(cmd.response + sizeof resp),
			     in_len - sizeof cmd, out_len - sizeof resp);

	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
	if (IS_ERR(ucontext)) {
		ret = PTR_ERR(ucontext);
		goto err;
	}

	ucontext->device = ib_dev;
	INIT_LIST_HEAD(&ucontext->pd_list);
	INIT_LIST_HEAD(&ucontext->mr_list);
	INIT_LIST_HEAD(&ucontext->mw_list);
	INIT_LIST_HEAD(&ucontext->cq_list);
	INIT_LIST_HEAD(&ucontext->qp_list);
	INIT_LIST_HEAD(&ucontext->srq_list);
	INIT_LIST_HEAD(&ucontext->ah_list);
	INIT_LIST_HEAD(&ucontext->wq_list);
	INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list);
	INIT_LIST_HEAD(&ucontext->xrcd_list);
	INIT_LIST_HEAD(&ucontext->rule_list);
	/* Pin the pid of the caller's group leader; dropped with put_pid(). */
	rcu_read_lock();
	ucontext->tgid = get_pid(task_pid_group_leader(current));
	rcu_read_unlock();
	ucontext->closing = 0;

#ifdef
CONFIG_INFINIBAND_ON_DEMAND_PAGING
	ucontext->umem_tree = RB_ROOT;
	init_rwsem(&ucontext->umem_rwsem);
	ucontext->odp_mrs_count = 0;
	INIT_LIST_HEAD(&ucontext->no_private_counters);

	/* Without ODP capability, make sure no invalidate hook is set. */
	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
		ucontext->invalidate_range = NULL;

#endif

	resp.num_comp_vectors = file->device->num_comp_vectors;

	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0)
		goto err_free;
	resp.async_fd = ret;

	filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
	if (IS_ERR(filp)) {
		ret = PTR_ERR(filp);
		goto err_fd;
	}

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp)) {
		ret = -EFAULT;
		goto err_file;
	}

	file->ucontext = ucontext;

	/* Publish the fd only after everything that can fail has succeeded. */
	fd_install(resp.async_fd, filp);

	mutex_unlock(&file->mutex);

	return in_len;

	/* Unwind in reverse order of acquisition. */
err_file:
	ib_uverbs_free_async_event_file(file);
	fput(filp);

err_fd:
	put_unused_fd(resp.async_fd);

err_free:
	put_pid(ucontext->tgid);
	ib_dev->dealloc_ucontext(ucontext);

err:
	mutex_unlock(&file->mutex);
	return ret;
}

/*
 * Fill a query-device response from the device attributes.  node_guid and
 * phys_port_cnt come from ib_dev; every other field is copied from 'attr'.
 * device_cap_flags is narrowed to 32 bits.  'file' is not used here.
 */
static void copy_query_dev_fields(struct ib_uverbs_file *file,
				  struct ib_device *ib_dev,
				  struct ib_uverbs_query_device_resp *resp,
				  struct ib_device_attr *attr)
{
	resp->fw_ver = attr->fw_ver;
	resp->node_guid = ib_dev->node_guid;
	resp->sys_image_guid = attr->sys_image_guid;
	resp->max_mr_size = attr->max_mr_size;
	resp->page_size_cap = attr->page_size_cap;
	resp->vendor_id = attr->vendor_id;
	resp->vendor_part_id = attr->vendor_part_id;
	resp->hw_ver = attr->hw_ver;
	resp->max_qp = attr->max_qp;
	resp->max_qp_wr = attr->max_qp_wr;
	resp->device_cap_flags = (u32)(attr->device_cap_flags);
	resp->max_sge = attr->max_sge;
	resp->max_sge_rd = attr->max_sge_rd;
	resp->max_cq = attr->max_cq;
	resp->max_cqe = attr->max_cqe;
	resp->max_mr = attr->max_mr;
	resp->max_pd = attr->max_pd;
	resp->max_qp_rd_atom = attr->max_qp_rd_atom;
	resp->max_ee_rd_atom = attr->max_ee_rd_atom;
	resp->max_res_rd_atom = attr->max_res_rd_atom;
	resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
	resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
resp->atomic_cap = attr->atomic_cap;
	resp->max_ee = attr->max_ee;
	resp->max_rdd = attr->max_rdd;
	resp->max_mw = attr->max_mw;
	resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
	resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
	resp->max_mcast_grp = attr->max_mcast_grp;
	resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
	resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
	resp->max_ah = attr->max_ah;
	resp->max_fmr = attr->max_fmr;
	resp->max_map_per_fmr = attr->max_map_per_fmr;
	resp->max_srq = attr->max_srq;
	resp->max_srq_wr = attr->max_srq_wr;
	resp->max_srq_sge = attr->max_srq_sge;
	resp->max_pkeys = attr->max_pkeys;
	resp->local_ca_ack_delay = attr->local_ca_ack_delay;
	resp->phys_port_cnt = ib_dev->phys_port_cnt;
}

/*
 * QUERY_DEVICE command: report ib_dev->attrs to user space.
 *
 * Returns in_len on success, -ENOSPC if the response buffer is too small,
 * or -EFAULT on a failed user copy.
 */
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
			       struct ib_device *ib_dev,
			       const char __user *buf,
			       int in_len, int out_len)
{
	struct ib_uverbs_query_device cmd;
	struct ib_uverbs_query_device_resp resp;

	if (out_len < sizeof resp)
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	/* Zero first so fields not set by the helper are not left uninitialised. */
	memset(&resp, 0, sizeof resp);
	copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs);

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp))
		return -EFAULT;

	return in_len;
}

/*
 * QUERY_PORT command: query cmd.port_num with ib_query_port() and copy the
 * attributes into the user response.
 *
 * Returns in_len on success, -ENOSPC / -EFAULT as for the other commands,
 * or the error returned by ib_query_port().
 */
ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
			     struct ib_device *ib_dev,
			     const char __user *buf,
			     int in_len, int out_len)
{
	struct ib_uverbs_query_port cmd;
	struct ib_uverbs_query_port_resp resp;
	struct ib_port_attr attr;
	int ret;

	if (out_len < sizeof resp)
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	ret = ib_query_port(ib_dev, cmd.port_num, &attr);
	if (ret)
		return ret;

	memset(&resp, 0, sizeof resp);

	resp.state = attr.state;
	resp.max_mtu = attr.max_mtu;
	resp.active_mtu = attr.active_mtu;
	resp.gid_tbl_len = attr.gid_tbl_len;
	resp.port_cap_flags = attr.port_cap_flags;
	resp.max_msg_sz = attr.max_msg_sz;
	resp.bad_pkey_cntr = attr.bad_pkey_cntr;
	resp.qkey_viol_cntr = attr.qkey_viol_cntr;
resp.pkey_tbl_len = attr.pkey_tbl_len;
	resp.lid = attr.lid;
	resp.sm_lid = attr.sm_lid;
	resp.lmc = attr.lmc;
	resp.max_vl_num = attr.max_vl_num;
	resp.sm_sl = attr.sm_sl;
	resp.subnet_timeout = attr.subnet_timeout;
	resp.init_type_reply = attr.init_type_reply;
	resp.active_width = attr.active_width;
	resp.active_speed = attr.active_speed;
	resp.phys_state = attr.phys_state;
	/* link_layer is not taken from attr; it is queried separately. */
	resp.link_layer = rdma_port_get_link_layer(ib_dev,
						   cmd.port_num);

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp))
		return -EFAULT;

	return in_len;
}

/*
 * ALLOC_PD command: allocate a protection domain through the driver, wrap
 * it in a new uobject, publish it in ib_uverbs_pd_idr and return its
 * handle.
 *
 * The uobject is write-locked while it is being set up and is marked live
 * only after it has been linked into the context's pd_list.
 *
 * Returns in_len on success, or -ENOSPC, -EFAULT, -ENOMEM, or the error
 * from the driver / idr.
 */
ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
			   struct ib_device *ib_dev,
			   const char __user *buf,
			   int in_len, int out_len)
{
	struct ib_uverbs_alloc_pd cmd;
	struct ib_uverbs_alloc_pd_resp resp;
	struct ib_udata udata;
	struct ib_uobject *uobj;
	struct ib_pd *pd;
	int ret;

	if (out_len < sizeof resp)
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	ib_uverbs_init_udata(&udata, buf + sizeof cmd,
			     u64_to_user_ptr(cmd.response + sizeof resp),
			     in_len - sizeof cmd, out_len - sizeof resp);

	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
	if (!uobj)
		return -ENOMEM;

	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
	down_write(&uobj->mutex);

	pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
	if (IS_ERR(pd)) {
		ret = PTR_ERR(pd);
		goto err;
	}

	pd->device = ib_dev;
	pd->uobject = uobj;
	pd->__internal_mr = NULL;
	atomic_set(&pd->usecnt, 0);

	uobj->object = pd;
	ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
	if (ret)
		goto err_idr;

	memset(&resp, 0, sizeof resp);
	resp.pd_handle = uobj->id;

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp)) {
		ret = -EFAULT;
		goto err_copy;
	}

	mutex_lock(&file->mutex);
	list_add_tail(&uobj->list, &file->ucontext->pd_list);
	mutex_unlock(&file->mutex);

	/* From here on lookups of this handle succeed. */
	uobj->live = 1;

	up_write(&uobj->mutex);

	return in_len;

err_copy:
	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);

err_idr:
	ib_dealloc_pd(pd);

err:
	/* Drops the write lock and the initial reference taken by init_uobj(). */
	put_uobj_write(uobj);
	return ret;
}

/*
 * DEALLOC_PD command (the remainder of the signature and the body follow
 * on the next physical line of this file).
 */
ssize_t ib_uverbs_dealloc_pd(struct
ib_uverbs_file *file,
			     struct ib_device *ib_dev,
			     const char __user *buf,
			     int in_len, int out_len)
{
	struct ib_uverbs_dealloc_pd cmd;
	struct ib_uobject *uobj;
	struct ib_pd *pd;
	int ret;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
	if (!uobj)
		return -EINVAL;
	pd = uobj->object;

	/* Refuse while other objects still reference this PD. */
	if (atomic_read(&pd->usecnt)) {
		ret = -EBUSY;
		goto err_put;
	}

	ret = pd->device->dealloc_pd(uobj->object);
	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
	if (ret)
		goto err_put;

	/* Make concurrent lookups fail before the write lock is dropped. */
	uobj->live = 0;
	put_uobj_write(uobj);

	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);

	mutex_lock(&file->mutex);
	list_del(&uobj->list);
	mutex_unlock(&file->mutex);

	/* Second put, in addition to the one inside put_uobj_write() above. */
	put_uobj(uobj);

	return in_len;

err_put:
	put_uobj_write(uobj);
	return ret;
}

/* One node of the per-device rb-tree that maps a vnode to its XRCD. */
struct xrcd_table_entry {
	struct rb_node node;
	struct ib_xrcd *xrcd;
	struct vnode *vnode;
};

/*
 * Insert a vnode -> xrcd mapping into dev->xrcd_tree.  The tree is ordered
 * by the numeric value of the vnode pointer.  On success an extra reference
 * on the vnode is taken with vrefact().
 *
 * Returns 0, -ENOMEM, or -EEXIST if the vnode is already present.
 */
static int xrcd_table_insert(struct ib_uverbs_device *dev,
			    struct vnode *vnode,
			    struct ib_xrcd *xrcd)
{
	struct xrcd_table_entry *entry, *scan;
	struct rb_node **p = &dev->xrcd_tree.rb_node;
	struct rb_node *parent = NULL;

	entry = kmalloc(sizeof *entry, GFP_KERNEL);
	if (!entry)
		return -ENOMEM;

	entry->xrcd = xrcd;
	entry->vnode = vnode;

	while (*p) {
		parent = *p;
		scan = rb_entry(parent, struct xrcd_table_entry, node);

		if ((uintptr_t)vnode < (uintptr_t)scan->vnode) {
			p = &(*p)->rb_left;
		} else if ((uintptr_t)vnode > (uintptr_t)scan->vnode) {
			p = &(*p)->rb_right;
		} else {
			kfree(entry);
			return -EEXIST;
		}
	}

	rb_link_node(&entry->node, parent, p);
	rb_insert_color(&entry->node, &dev->xrcd_tree);
	vrefact(vnode);
	return 0;
}

/* Find the tree entry for 'vnode'; the final return is NULL when absent. */
static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
						  struct vnode *vnode)
{
	struct xrcd_table_entry *entry;
	struct rb_node *p = dev->xrcd_tree.rb_node;

	while (p) {
		entry = rb_entry(p, struct xrcd_table_entry, node);

		if ((uintptr_t)vnode < (uintptr_t)entry->vnode)
			p = p->rb_left;
		else if ((uintptr_t)vnode > (uintptr_t)entry->vnode)
			p = p->rb_right;
		else
			return entry;
	}

	return
NULL;
}

/* Return the XRCD mapped to 'vnode' in dev's table, or NULL. */
static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct vnode *vnode)
{
	struct xrcd_table_entry *entry;

	entry = xrcd_table_search(dev, vnode);
	if (!entry)
		return NULL;

	return entry->xrcd;
}

/*
 * Remove the mapping for 'vnode', if any: drop the vnode reference taken
 * by xrcd_table_insert(), unlink the node and free it.
 */
static void xrcd_table_delete(struct ib_uverbs_device *dev,
			      struct vnode *vnode)
{
	struct xrcd_table_entry *entry;

	entry = xrcd_table_search(dev, vnode);
	if (entry) {
		vrele(vnode);
		rb_erase(&entry->node, &dev->xrcd_tree);
		kfree(entry);
	}
}

/*
 * OPEN_XRCD command: open an XRC domain, optionally shared through the
 * file referred to by cmd.fd, and return a handle to it.
 *
 * With cmd.fd != -1 the fd is resolved to a vnode and looked up in the
 * device's table; O_CREAT / O_EXCL in cmd.oflags decide whether a missing
 * or existing XRCD is acceptable.  The whole operation runs under
 * file->device->xrcd_tree_mutex.
 */
ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
			    struct ib_device *ib_dev,
			    const char __user *buf, int in_len,
			    int out_len)
{
	struct ib_uverbs_open_xrcd cmd;
	struct ib_uverbs_open_xrcd_resp resp;
	struct ib_udata udata;
	struct ib_uxrcd_object *obj;
	struct ib_xrcd *xrcd = NULL;
	struct vnode *vnode = NULL;
	int ret = 0;
	int new_xrcd = 0;

	if (out_len < sizeof resp)
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	ib_uverbs_init_udata(&udata, buf + sizeof cmd,
			     u64_to_user_ptr(cmd.response + sizeof resp),
			     in_len - sizeof cmd, out_len - sizeof resp);

	mutex_lock(&file->device->xrcd_tree_mutex);

	if (cmd.fd != -1) {
		/* search for file descriptor */
		ret = -fgetvp(curthread, cmd.fd, &cap_no_rights, &vnode);
		if (ret != 0)
			goto err_tree_mutex_unlock;

		xrcd = find_xrcd(file->device, vnode);
		if (!xrcd && !(cmd.oflags & O_CREAT)) {
			/* no file descriptor.
Need CREATE flag */
			ret = -EAGAIN;
			goto err_tree_mutex_unlock;
		}

		/* The XRCD already exists but exclusive creation was requested. */
		if (xrcd && cmd.oflags & O_EXCL) {
			ret = -EINVAL;
			goto err_tree_mutex_unlock;
		}
	}

	obj = kmalloc(sizeof *obj, GFP_KERNEL);
	if (!obj) {
		ret = -ENOMEM;
		goto err_tree_mutex_unlock;
	}

	init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class);

	down_write(&obj->uobject.mutex);

	/* Nothing found (or no fd given): allocate a new XRCD from the driver. */
	if (!xrcd) {
		xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
		if (IS_ERR(xrcd)) {
			ret = PTR_ERR(xrcd);
			goto err;
		}

		xrcd->vnode = vnode;
		xrcd->device = ib_dev;
		atomic_set(&xrcd->usecnt, 0);
		mutex_init(&xrcd->tgt_qp_mutex);
		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
		new_xrcd = 1;
	}

	atomic_set(&obj->refcnt, 0);
	obj->uobject.object = xrcd;
	ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
	if (ret)
		goto err_idr;

	memset(&resp, 0, sizeof resp);
	resp.xrcd_handle = obj->uobject.id;

	if (vnode != NULL) {
		if (new_xrcd) {
			/* create new vnode/xrcd table entry */
			ret = xrcd_table_insert(file->device, vnode, xrcd);
			if (ret)
				goto err_insert_xrcd;
		}
		/* One use count per handle opened on a vnode-backed XRCD. */
		atomic_inc(&xrcd->usecnt);
	}

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp)) {
		ret = -EFAULT;
		goto err_copy;
	}

	/* Drop the reference obtained from fgetvp(). */
	if (vnode != NULL)
		vrele(vnode);

	mutex_lock(&file->mutex);
	list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list);
	mutex_unlock(&file->mutex);

	obj->uobject.live = 1;
	up_write(&obj->uobject.mutex);

	mutex_unlock(&file->device->xrcd_tree_mutex);
	return in_len;

	/* Unwind in reverse order of the steps above. */
err_copy:
	if (vnode != NULL) {
		if (new_xrcd)
			xrcd_table_delete(file->device, vnode);
		atomic_dec(&xrcd->usecnt);
	}

err_insert_xrcd:
	idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);

err_idr:
	/*
	 * NOTE(review): this is reached for a pre-existing (shared) XRCD as
	 * well as for a newly allocated one; presumably ib_dealloc_xrcd()
	 * refuses while usecnt is non-zero -- confirm against its definition.
	 */
	ib_dealloc_xrcd(xrcd);

err:
	put_uobj_write(&obj->uobject);

err_tree_mutex_unlock:
	if (vnode != NULL)
		vrele(vnode);

	mutex_unlock(&file->device->xrcd_tree_mutex);

	return ret;
}

/*
 * CLOSE_XRCD command: release one handle on an XRC domain (the body
 * continues on the next physical line of this file).
 */
ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
			     struct ib_device *ib_dev,
			     const char __user *buf, int in_len,
			     int out_len)
{
	struct ib_uverbs_close_xrcd cmd;
	struct ib_uobject *uobj;
	struct ib_xrcd *xrcd = NULL;
	struct
vnode *vnode = NULL;
	struct ib_uxrcd_object *obj;
	int live;
	int ret = 0;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	mutex_lock(&file->device->xrcd_tree_mutex);
	uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext);
	if (!uobj) {
		ret = -EINVAL;
		goto out;
	}

	xrcd = uobj->object;
	vnode = xrcd->vnode;
	obj = container_of(uobj, struct ib_uxrcd_object, uobject);
	/* Still referenced through obj->refcnt: refuse to close. */
	if (atomic_read(&obj->refcnt)) {
		put_uobj_write(uobj);
		ret = -EBUSY;
		goto out;
	}

	/*
	 * Deallocate the XRCD itself only when it is not vnode-backed, or
	 * when this was the last use of a vnode-backed one.
	 */
	if (!vnode || atomic_dec_and_test(&xrcd->usecnt)) {
		ret = ib_dealloc_xrcd(uobj->object);
		if (!ret)
			uobj->live = 0;
	}

	/* Sample 'live' while the write lock is still held. */
	live = uobj->live;
	/* Deallocation failed: give back the use count dropped above. */
	if (vnode && ret)
		atomic_inc(&xrcd->usecnt);

	put_uobj_write(uobj);

	if (ret)
		goto out;

	/* The XRCD was really freed, so its table entry goes too. */
	if (vnode && !live)
		xrcd_table_delete(file->device, vnode);

	idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
	mutex_lock(&file->mutex);
	list_del(&uobj->list);
	mutex_unlock(&file->mutex);

	put_uobj(uobj);
	ret = in_len;

out:
	mutex_unlock(&file->device->xrcd_tree_mutex);
	return ret;
}

/*
 * Drop one use of 'xrcd'.  A vnode-backed XRCD is deallocated (and its
 * table entry removed) only when usecnt reaches zero; one without a vnode
 * is deallocated immediately.  The result of ib_dealloc_xrcd() is ignored.
 */
void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
			    struct ib_xrcd *xrcd)
{
	struct vnode *vnode;

	vnode = xrcd->vnode;
	if (vnode && !atomic_dec_and_test(&xrcd->usecnt))
		return;

	ib_dealloc_xrcd(xrcd);

	if (vnode)
		xrcd_table_delete(dev, vnode);
}

/*
 * REG_MR command: register a user memory region on the PD named by
 * cmd.pd_handle and return its handle, lkey and rkey.
 *
 * cmd.start and cmd.hca_va must have the same offset within a page, and
 * the access flags must pass ib_check_mr_access().
 */
ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
			 struct ib_device *ib_dev,
			 const char __user *buf, int in_len,
			 int out_len)
{
	struct ib_uverbs_reg_mr cmd;
	struct ib_uverbs_reg_mr_resp resp;
	struct ib_udata udata;
	struct ib_uobject *uobj;
	struct ib_pd *pd;
	struct ib_mr *mr;
	int ret;

	if (out_len < sizeof resp)
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	ib_uverbs_init_udata(&udata, buf + sizeof cmd,
			     u64_to_user_ptr(cmd.response + sizeof resp),
			     in_len - sizeof cmd, out_len - sizeof resp);

	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
		return -EINVAL;

	ret = ib_check_mr_access(cmd.access_flags);
	if (ret)
		return ret;

	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
	if (!uobj)
		return -ENOMEM;

	init_uobj(uobj, 0, file->ucontext,
&mr_lock_class);
	down_write(&uobj->mutex);

	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
	if (!pd) {
		ret = -EINVAL;
		goto err_free;
	}

	/* On-demand paging requested: the device must advertise support. */
	if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
		if (!(pd->device->attrs.device_cap_flags &
		      IB_DEVICE_ON_DEMAND_PAGING)) {
			pr_debug("ODP support not available\n");
			ret = -EINVAL;
			goto err_put;
		}
	}

	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
				     cmd.access_flags, &udata);
	if (IS_ERR(mr)) {
		ret = PTR_ERR(mr);
		goto err_put;
	}

	mr->device = pd->device;
	mr->pd = pd;
	mr->uobject = uobj;
	/* The MR now counts as a user of the PD. */
	atomic_inc(&pd->usecnt);

	uobj->object = mr;
	ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
	if (ret)
		goto err_unreg;

	memset(&resp, 0, sizeof resp);
	resp.lkey = mr->lkey;
	resp.rkey = mr->rkey;
	resp.mr_handle = uobj->id;

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp)) {
		ret = -EFAULT;
		goto err_copy;
	}

	put_pd_read(pd);

	mutex_lock(&file->mutex);
	list_add_tail(&uobj->list, &file->ucontext->mr_list);
	mutex_unlock(&file->mutex);

	uobj->live = 1;

	up_write(&uobj->mutex);

	return in_len;

err_copy:
	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);

err_unreg:
	ib_dereg_mr(mr);

err_put:
	put_pd_read(pd);

err_free:
	put_uobj_write(uobj);
	return ret;
}

/*
 * REREG_MR command: change the translation, access flags and/or PD of an
 * existing MR, as selected by cmd.flags.
 *
 * cmd.flags must be non-zero and contain only IB_MR_REREG_SUPPORTED bits.
 * With IB_MR_REREG_TRANS, start, hca_va and length must be non-zero and
 * start / hca_va must share the same in-page offset.
 */
ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
			   struct ib_device *ib_dev,
			   const char __user *buf, int in_len,
			   int out_len)
{
	struct ib_uverbs_rereg_mr cmd;
	struct ib_uverbs_rereg_mr_resp resp;
	struct ib_udata udata;
	struct ib_pd *pd = NULL;
	struct ib_mr *mr;
	struct ib_pd *old_pd;
	int ret;
	struct ib_uobject *uobj;

	if (out_len < sizeof(resp))
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof(cmd)))
		return -EFAULT;

	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
			     u64_to_user_ptr(cmd.response + sizeof(resp)),
			     in_len - sizeof(cmd), out_len - sizeof(resp));

	if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
		return -EINVAL;

	if ((cmd.flags & IB_MR_REREG_TRANS) &&
	    (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
	     (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
		return -EINVAL;
uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
			      file->ucontext);

	if (!uobj)
		return -EINVAL;

	mr = uobj->object;

	if (cmd.flags & IB_MR_REREG_ACCESS) {
		ret = ib_check_mr_access(cmd.access_flags);
		if (ret)
			goto put_uobjs;
	}

	/* A new PD is looked up (and read-locked) only when requested. */
	if (cmd.flags & IB_MR_REREG_PD) {
		pd = idr_read_pd(cmd.pd_handle, file->ucontext);
		if (!pd) {
			ret = -EINVAL;
			goto put_uobjs;
		}
	}

	old_pd = mr->pd;
	ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
					cmd.length, cmd.hca_va,
					cmd.access_flags, pd, &udata);
	if (!ret) {
		/* Move the MR's use count from the old PD to the new one. */
		if (cmd.flags & IB_MR_REREG_PD) {
			atomic_inc(&pd->usecnt);
			mr->pd = pd;
			atomic_dec(&old_pd->usecnt);
		}
	} else {
		goto put_uobj_pd;
	}

	memset(&resp, 0, sizeof(resp));
	resp.lkey = mr->lkey;
	resp.rkey = mr->rkey;

	/* Success and failure both fall through to the shared cleanup below. */
	if (copy_to_user((void __user *)(unsigned long)cmd.response,
			 &resp, sizeof(resp)))
		ret = -EFAULT;
	else
		ret = in_len;

put_uobj_pd:
	if (cmd.flags & IB_MR_REREG_PD)
		put_pd_read(pd);

put_uobjs:

	put_uobj_write(mr->uobject);

	return ret;
}

/*
 * DEREG_MR command: deregister the MR named by cmd.mr_handle and drop its
 * uobject.  Returns in_len on success, -EFAULT, -EINVAL for an unknown
 * handle, or the error returned by ib_dereg_mr().
 */
ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
			   struct ib_device *ib_dev,
			   const char __user *buf, int in_len,
			   int out_len)
{
	struct ib_uverbs_dereg_mr cmd;
	struct ib_mr *mr;
	struct ib_uobject *uobj;
	int ret = -EINVAL;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
	if (!uobj)
		return -EINVAL;

	mr = uobj->object;

	ret = ib_dereg_mr(mr);
	/* Clear 'live' only on success, while the write lock is still held. */
	if (!ret)
		uobj->live = 0;

	put_uobj_write(uobj);

	if (ret)
		return ret;

	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);

	mutex_lock(&file->mutex);
	list_del(&uobj->list);
	mutex_unlock(&file->mutex);

	put_uobj(uobj);

	return in_len;
}

/*
 * ALLOC_MW command: allocate a memory window of type cmd.mw_type on the PD
 * named by cmd.pd_handle (the body continues on the next physical line).
 */
ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
			   struct ib_device *ib_dev,
			   const char __user *buf, int in_len,
			   int out_len)
{
	struct ib_uverbs_alloc_mw cmd;
	struct ib_uverbs_alloc_mw_resp resp;
	struct ib_uobject *uobj;
	struct ib_pd *pd;
	struct ib_mw *mw;
	struct ib_udata udata;
	int ret;

	if (out_len < sizeof(resp))
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof(cmd)))
		return -EFAULT;

	uobj =
kmalloc(sizeof(*uobj), GFP_KERNEL);
	if (!uobj)
		return -ENOMEM;

	init_uobj(uobj, 0, file->ucontext, &mw_lock_class);
	down_write(&uobj->mutex);

	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
	if (!pd) {
		ret = -EINVAL;
		goto err_free;
	}

	/*
	 * NOTE(review): the input length here additionally subtracts
	 * sizeof(struct ib_uverbs_cmd_hdr), unlike the other handlers in
	 * this file -- confirm this is intentional.
	 */
	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
			     u64_to_user_ptr(cmd.response + sizeof(resp)),
			     in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
			     out_len - sizeof(resp));

	mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
	if (IS_ERR(mw)) {
		ret = PTR_ERR(mw);
		goto err_put;
	}

	mw->device = pd->device;
	mw->pd = pd;
	mw->uobject = uobj;
	/* The MW now counts as a user of the PD. */
	atomic_inc(&pd->usecnt);

	uobj->object = mw;
	ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj);
	if (ret)
		goto err_unalloc;

	memset(&resp, 0, sizeof(resp));
	resp.rkey = mw->rkey;
	resp.mw_handle = uobj->id;

	if (copy_to_user((void __user *)(unsigned long)cmd.response,
			 &resp, sizeof(resp))) {
		ret = -EFAULT;
		goto err_copy;
	}

	put_pd_read(pd);

	mutex_lock(&file->mutex);
	list_add_tail(&uobj->list, &file->ucontext->mw_list);
	mutex_unlock(&file->mutex);

	uobj->live = 1;

	up_write(&uobj->mutex);

	return in_len;

err_copy:
	idr_remove_uobj(&ib_uverbs_mw_idr, uobj);

err_unalloc:
	uverbs_dealloc_mw(mw);

err_put:
	put_pd_read(pd);

err_free:
	put_uobj_write(uobj);
	return ret;
}

/*
 * DEALLOC_MW command: free the memory window named by cmd.mw_handle and
 * drop its uobject.  Returns in_len on success, -EFAULT, -EINVAL for an
 * unknown handle, or the error returned by uverbs_dealloc_mw().
 */
ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
			     struct ib_device *ib_dev,
			     const char __user *buf, int in_len,
			     int out_len)
{
	struct ib_uverbs_dealloc_mw cmd;
	struct ib_mw *mw;
	struct ib_uobject *uobj;
	int ret = -EINVAL;

	if (copy_from_user(&cmd, buf, sizeof(cmd)))
		return -EFAULT;

	uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext);
	if (!uobj)
		return -EINVAL;

	mw = uobj->object;

	ret = uverbs_dealloc_mw(mw);
	/* Clear 'live' only on success, while the write lock is still held. */
	if (!ret)
		uobj->live = 0;

	put_uobj_write(uobj);

	if (ret)
		return ret;

	idr_remove_uobj(&ib_uverbs_mw_idr, uobj);

	mutex_lock(&file->mutex);
	list_del(&uobj->list);
	mutex_unlock(&file->mutex);

	put_uobj(uobj);

	return in_len;
}

/*
 * CREATE_COMP_CHANNEL command (the remainder of the signature and the body
 * follow on the next physical line of this file).
 */
ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
				      struct ib_device *ib_dev,
				      const char __user
*buf, int in_len,
				      int out_len)
{
	struct ib_uverbs_create_comp_channel cmd;
	struct ib_uverbs_create_comp_channel_resp resp;
	struct file *filp;
	int ret;

	if (out_len < sizeof resp)
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	/* Reserve an fd first; it is installed only after the response copy. */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0)
		return ret;
	resp.fd = ret;

	filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
	if (IS_ERR(filp)) {
		put_unused_fd(resp.fd);
		return PTR_ERR(filp);
	}

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp)) {
		put_unused_fd(resp.fd);
		fput(filp);
		return -EFAULT;
	}

	fd_install(resp.fd, filp);
	return in_len;
}

/*
 * Common CQ creation path shared by the legacy and extended commands.
 *
 * cmd_sz is the number of valid bytes in *cmd; cmd->flags is used only
 * when cmd_sz extends beyond that field.  After the CQ has been created
 * and added to the idr, 'cb' is invoked to deliver the response to user
 * space; a non-zero return from it aborts and unwinds the creation.
 *
 * Returns the new ib_ucq_object, or an ERR_PTR() on failure.
 */
static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
				       struct ib_device *ib_dev,
				       struct ib_udata *ucore,
				       struct ib_udata *uhw,
				       struct ib_uverbs_ex_create_cq *cmd,
				       size_t cmd_sz,
				       int (*cb)(struct ib_uverbs_file *file,
						 struct ib_ucq_object *obj,
						 struct ib_uverbs_ex_create_cq_resp *resp,
						 struct ib_udata *udata,
						 void *context),
				       void *context)
{
	struct ib_ucq_object *obj;
	struct ib_uverbs_event_file *ev_file = NULL;
	struct ib_cq *cq;
	int ret;
	struct ib_uverbs_ex_create_cq_resp resp;
	struct ib_cq_init_attr attr = {};

	if (cmd->comp_vector >= file->device->num_comp_vectors)
		return ERR_PTR(-EINVAL);

	obj = kmalloc(sizeof *obj, GFP_KERNEL);
	if (!obj)
		return ERR_PTR(-ENOMEM);

	init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class);
	down_write(&obj->uobject.mutex);

	/* A negative comp_channel means "no completion channel". */
	if (cmd->comp_channel >= 0) {
		ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel);
		if (!ev_file) {
			ret = -EINVAL;
			goto err;
		}
	}

	obj->uverbs_file = file;
	obj->comp_events_reported = 0;
	obj->async_events_reported = 0;
	INIT_LIST_HEAD(&obj->comp_list);
	INIT_LIST_HEAD(&obj->async_list);

	attr.cqe = cmd->cqe;
	attr.comp_vector = cmd->comp_vector;

	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
		attr.flags = cmd->flags;

	cq = ib_dev->create_cq(ib_dev, &attr,
			       file->ucontext, uhw);
	if (IS_ERR(cq)) {
		ret = PTR_ERR(cq);
		goto err_file;
	}

	cq->device = ib_dev;
cq->uobject = &obj->uobject;
	cq->comp_handler = ib_uverbs_comp_handler;
	cq->event_handler = ib_uverbs_cq_event_handler;
	cq->cq_context = ev_file;
	atomic_set(&cq->usecnt, 0);

	obj->uobject.object = cq;
	ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
	if (ret)
		goto err_free;

	memset(&resp, 0, sizeof resp);
	resp.base.cq_handle = obj->uobject.id;
	resp.base.cqe = cq->cqe;

	/* Valid response bytes: everything up to and including response_length. */
	resp.response_length = offsetof(typeof(resp), response_length) +
		sizeof(resp.response_length);

	ret = cb(file, obj, &resp, ucore, context);
	if (ret)
		goto err_cb;

	mutex_lock(&file->mutex);
	list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
	mutex_unlock(&file->mutex);

	obj->uobject.live = 1;

	up_write(&obj->uobject.mutex);

	return obj;

err_cb:
	idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);

err_free:
	ib_destroy_cq(cq);

err_file:
	if (ev_file)
		ib_uverbs_release_ucq(file, ev_file, obj);

err:
	put_uobj_write(&obj->uobject);

	return ERR_PTR(ret);
}

/*
 * Response callback for the legacy CREATE_CQ command: copy only the
 * 'base' part of the extended response to user space.
 */
static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
				  struct ib_ucq_object *obj,
				  struct ib_uverbs_ex_create_cq_resp *resp,
				  struct ib_udata *ucore, void *context)
{
	if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
		return -EFAULT;

	return 0;
}

/*
 * Legacy CREATE_CQ command: translate the command into the extended
 * layout and hand it to create_cq() (the call itself follows on the next
 * physical line of this file).
 */
ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
			    struct ib_device *ib_dev,
			    const char __user *buf, int in_len,
			    int out_len)
{
	struct ib_uverbs_create_cq cmd;
	struct ib_uverbs_ex_create_cq cmd_ex;
	struct ib_uverbs_create_cq_resp resp;
	struct ib_udata ucore;
	struct ib_udata uhw;
	struct ib_ucq_object *obj;

	if (out_len < sizeof(resp))
		return -ENOSPC;

	if (copy_from_user(&cmd, buf, sizeof(cmd)))
		return -EFAULT;

	/* ucore covers the fixed command/response; uhw covers what follows them. */
	ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response),
			     sizeof(cmd), sizeof(resp));

	ib_uverbs_init_udata(&uhw, buf + sizeof(cmd),
			     u64_to_user_ptr(cmd.response + sizeof(resp)),
			     in_len - sizeof(cmd), out_len - sizeof(resp));

	memset(&cmd_ex, 0, sizeof(cmd_ex));
	cmd_ex.user_handle = cmd.user_handle;
	cmd_ex.cqe = cmd.cqe;
	cmd_ex.comp_vector = cmd.comp_vector;
	cmd_ex.comp_channel =
cmd.comp_channel;

	/*
	 * cmd_sz stops right after comp_channel, so create_cq() will not
	 * read cmd_ex.flags for a legacy request.
	 */
	obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
			offsetof(typeof(cmd_ex), comp_channel) +
			sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
			NULL);

	if (IS_ERR(obj))
		return PTR_ERR(obj);

	return in_len;
}

/*
 * Response callback for the extended CREATE_CQ command: copy
 * resp->response_length bytes of the response to user space.
 */
static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
				     struct ib_ucq_object *obj,
				     struct ib_uverbs_ex_create_cq_resp *resp,
				     struct ib_udata *ucore, void *context)
{
	if (ib_copy_to_udata(ucore, resp, resp->response_length))
		return -EFAULT;

	return 0;
}

/*
 * Extended CREATE_CQ command.  Rejects a short command, a non-zero
 * comp_mask or reserved field (-EINVAL) and a response buffer that cannot
 * hold the fields up to response_length (-ENOSPC).
 *
 * Returns 0 on success (not a byte count), or a negative error.
 */
int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
			   struct ib_device *ib_dev,
			   struct ib_udata *ucore,
			   struct ib_udata *uhw)
{
	struct ib_uverbs_ex_create_cq_resp resp;
	struct ib_uverbs_ex_create_cq cmd;
	struct ib_ucq_object *obj;
	int err;

	if (ucore->inlen < sizeof(cmd))
		return -EINVAL;

	err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
	if (err)
		return err;

	if (cmd.comp_mask)
		return -EINVAL;

	if (cmd.reserved)
		return -EINVAL;

	if (ucore->outlen < (offsetof(typeof(resp), response_length) +
			     sizeof(resp.response_length)))
		return -ENOSPC;

	obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
			min(ucore->inlen, sizeof(cmd)),
			ib_uverbs_ex_create_cq_cb, NULL);

	if (IS_ERR(obj))
		return PTR_ERR(obj);

	return 0;
}

/*
 * RESIZE_CQ command: ask the driver to resize the CQ named by
 * cmd.cq_handle to cmd.cqe entries and report the resulting cq->cqe.
 * Only the 'cqe' field of the response is copied to user space.
 */
ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
			    struct ib_device *ib_dev,
			    const char __user *buf, int in_len,
			    int out_len)
{
	struct ib_uverbs_resize_cq cmd;
	struct ib_uverbs_resize_cq_resp resp;
	struct ib_udata udata;
	struct ib_cq *cq;
	int ret = -EINVAL;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	ib_uverbs_init_udata(&udata, buf + sizeof cmd,
			     u64_to_user_ptr(cmd.response + sizeof resp),
			     in_len - sizeof cmd, out_len - sizeof resp);

	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
	if (!cq)
		return -EINVAL;

	ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
	if (ret)
		goto out;

	resp.cqe = cq->cqe;

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp.cqe))
		ret = -EFAULT;

out:
	put_cq_read(cq);

	return ret ?
ret : in_len;
}

/*
 * Translate one kernel work completion into the user-visible
 * struct ib_uverbs_wc layout and copy it to dest.
 *
 * Returns 0 on success or -EFAULT when the copy to user space fails.
 */
static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
{
	struct ib_uverbs_wc uwc;

	uwc.wr_id		= wc->wr_id;
	uwc.status		= wc->status;
	uwc.opcode		= wc->opcode;
	uwc.vendor_err		= wc->vendor_err;
	uwc.byte_len		= wc->byte_len;
	uwc.ex.imm_data		= (__u32 __force) wc->ex.imm_data;
	uwc.qp_num		= wc->qp->qp_num;
	uwc.src_qp		= wc->src_qp;
	uwc.wc_flags		= wc->wc_flags;
	uwc.pkey_index		= wc->pkey_index;
	uwc.slid		= wc->slid;
	uwc.sl			= wc->sl;
	uwc.dlid_path_bits	= wc->dlid_path_bits;
	uwc.port_num		= wc->port_num;
	uwc.reserved		= 0;

	return copy_to_user(dest, &uwc, sizeof uwc) ? -EFAULT : 0;
}

/*
 * Poll-CQ command.
 *
 * Polls the CQ named by cmd.cq_handle one completion at a time, up to
 * cmd.ne entries, writing each completion to user space right after the
 * response header, and finally writes the header carrying the number of
 * completions delivered.
 *
 * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL for an
 * unknown handle, or the negative value returned by ib_poll_cq().
 */
ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
			  struct ib_device *ib_dev,
			  const char __user *buf, int in_len,
			  int out_len)
{
	struct ib_uverbs_poll_cq cmd;
	struct ib_uverbs_poll_cq_resp resp;
	u8 __user *resp_uptr;
	u8 __user *wc_uptr;
	struct ib_cq *cq;
	struct ib_wc polled;
	int ret;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
	if (!cq)
		return -EINVAL;

	/*
	 * A struct ib_uverbs_poll_cq_resp goes to user space first; the
	 * completions are laid out immediately behind it.
	 */
	resp_uptr = (void __user *)(unsigned long) cmd.response;
	wc_uptr = resp_uptr + sizeof resp;

	memset(&resp, 0, sizeof resp);
	for (; resp.count < cmd.ne;
	     ++resp.count, wc_uptr += sizeof(struct ib_uverbs_wc)) {
		ret = ib_poll_cq(cq, 1, &polled);
		if (ret < 0)
			goto out_put;
		if (ret == 0)
			break;		/* CQ drained */

		ret = copy_wc_to_user(wc_uptr, &polled);
		if (ret)
			goto out_put;
	}

	if (copy_to_user(resp_uptr, &resp, sizeof resp))
		ret = -EFAULT;
	else
		ret = in_len;

out_put:
	put_cq_read(cq);
	return ret;
}

ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
				struct ib_device *ib_dev,
				const char __user *buf, int in_len,
				int out_len)
{
	struct ib_uverbs_req_notify_cq cmd;
	struct ib_cq *cq;
	int retval;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
	if (!cq)
		return -EINVAL;

	if (ib_req_notify_cq(cq, cmd.solicited_only ?
IB_CQ_SOLICITED : IB_CQ_NEXT_COMP) < 0)
		retval = -ENXIO;
	else
		retval = in_len;

	put_cq_read(cq);

	return retval;
}

/*
 * Destroy-CQ command.
 *
 * Looks up the CQ uobject named by cmd.cq_handle for writing, destroys the
 * CQ, removes the uobject from the CQ idr and from the per-file list,
 * releases its pending events and reports the event counters back to user
 * space.
 *
 * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL for an
 * unknown handle, or the error returned by ib_destroy_cq().
 */
ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
			     struct ib_device *ib_dev,
			     const char __user *buf, int in_len,
			     int out_len)
{
	struct ib_uverbs_destroy_cq cmd;
	struct ib_uverbs_destroy_cq_resp resp;
	struct ib_uobject *uobj;
	struct ib_cq *cq;
	struct ib_ucq_object *obj;
	struct ib_uverbs_event_file *ev_file;
	int ret = -EINVAL;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
	if (!uobj)
		return -EINVAL;

	/*
	 * Everything needed from the CQ is read here; cq itself is not
	 * dereferenced again once ib_destroy_cq() has been called.
	 */
	cq = uobj->object;
	ev_file = cq->cq_context;
	obj = container_of(cq->uobject, struct ib_ucq_object, uobject);

	ret = ib_destroy_cq(cq);
	/* Only a successfully destroyed CQ is marked dead. */
	if (!ret)
		uobj->live = 0;

	put_uobj_write(uobj);

	/* On failure the uobject stays registered and listed. */
	if (ret)
		return ret;

	idr_remove_uobj(&ib_uverbs_cq_idr, uobj);

	mutex_lock(&file->mutex);
	list_del(&uobj->list);
	mutex_unlock(&file->mutex);

	ib_uverbs_release_ucq(file, ev_file, obj);

	/* Counters are read before put_uobj() drops this function's reference. */
	memset(&resp, 0, sizeof resp);
	resp.comp_events_reported = obj->comp_events_reported;
	resp.async_events_reported = obj->async_events_reported;

	put_uobj(uobj);

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp))
		return -EFAULT;

	return in_len;
}

static int create_qp(struct ib_uverbs_file *file,
		     struct ib_udata *ucore,
		     struct ib_udata *uhw,
		     struct ib_uverbs_ex_create_qp *cmd,
		     size_t cmd_sz,
		     int (*cb)(struct ib_uverbs_file *file,
			       struct ib_uverbs_ex_create_qp_resp *resp,
			       struct ib_udata *udata),
		     void *context)
{
	struct ib_uqp_object *obj;
	struct ib_device *device;
	struct ib_pd *pd = NULL;
	struct ib_xrcd *xrcd = NULL;
	struct ib_uobject *uninitialized_var(xrcd_uobj);
	struct ib_cq *scq = NULL, *rcq = NULL;
	struct ib_srq *srq = NULL;
	struct ib_qp *qp;
	char *buf;
	struct ib_qp_init_attr attr = {};
	struct ib_uverbs_ex_create_qp_resp resp;
	int ret;
	struct ib_rwq_ind_table *ind_tbl = NULL;
	bool has_sq = true;

	if (cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW) != 0)
return -EPERM; obj = kzalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); mutex_init(&obj->mcast_lock); down_write(&obj->uevent.uobject.mutex); if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + sizeof(cmd->rwq_ind_tbl_handle) && (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle, file->ucontext); if (!ind_tbl) { ret = -EINVAL; goto err_put; } attr.rwq_ind_tbl = ind_tbl; } if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + sizeof(cmd->reserved1)) && cmd->reserved1) { ret = -EOPNOTSUPP; goto err_put; } if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) { ret = -EINVAL; goto err_put; } if (ind_tbl && !cmd->max_send_wr) has_sq = false; if (cmd->qp_type == IB_QPT_XRC_TGT) { xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } device = xrcd->device; } else { if (cmd->qp_type == IB_QPT_XRC_INI) { cmd->max_recv_wr = 0; cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { srq = idr_read_srq(cmd->srq_handle, file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { ret = -EINVAL; goto err_put; } } if (!ind_tbl) { if (cmd->recv_cq_handle != cmd->send_cq_handle) { rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0); if (!rcq) { ret = -EINVAL; goto err_put; } } } } if (has_sq) scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); if (!ind_tbl) rcq = rcq ?: scq; pd = idr_read_pd(cmd->pd_handle, file->ucontext); if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; } device = pd->device; } attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; attr.xrcd = xrcd; attr.sq_sig_type = cmd->sq_sig_all ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd->qp_type; attr.create_flags = 0; attr.cap.max_send_wr = cmd->max_send_wr; attr.cap.max_recv_wr = cmd->max_recv_wr; attr.cap.max_send_sge = cmd->max_send_sge; attr.cap.max_recv_sge = cmd->max_recv_sge; attr.cap.max_inline_data = cmd->max_inline_data; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); if (cmd_sz >= offsetof(typeof(*cmd), create_flags) + sizeof(cmd->create_flags)) attr.create_flags = cmd->create_flags; if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV | IB_QP_CREATE_SCATTER_FCS)) { ret = -EINVAL; goto err_put; } buf = (char *)cmd + sizeof(*cmd); if (cmd_sz > sizeof(*cmd)) if (!(buf[0] == 0 && !memcmp(buf, buf + 1, cmd_sz - sizeof(*cmd) - 1))) { ret = -EINVAL; goto err_put; } if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else qp = device->create_qp(pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } if (cmd->qp_type != IB_QPT_XRC_TGT) { qp->real_qp = qp; qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; qp->rwq_ind_tbl = ind_tbl; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); if (attr.send_cq) atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); if (ind_tbl) atomic_inc(&ind_tbl->usecnt); } else { /* It is done in _ib_create_qp for other QP types */ qp->uobject = &obj->uevent.uobject; } qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp, 0, sizeof resp); resp.base.qpn = qp->qp_num; resp.base.qp_handle = obj->uevent.uobject.id; 
resp.base.max_recv_sge = attr.cap.max_recv_sge; resp.base.max_send_sge = attr.cap.max_send_sge; resp.base.max_recv_wr = attr.cap.max_recv_wr; resp.base.max_send_wr = attr.cap.max_send_wr; resp.base.max_inline_data = attr.cap.max_inline_data; resp.response_length = offsetof(typeof(resp), response_length) + sizeof(resp.response_length); ret = cb(file, &resp, ucore); if (ret) goto err_cb; if (xrcd) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); } if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); if (ind_tbl) put_rwq_indirection_table_read(ind_tbl); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return 0; err_cb: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); err_put: if (xrcd) put_xrcd_read(xrcd_uobj); if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); if (ind_tbl) put_rwq_indirection_table_read(ind_tbl); put_uobj_write(&obj->uevent.uobject); return ret; } static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file, struct ib_uverbs_ex_create_qp_resp *resp, struct ib_udata *ucore) { if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) return -EFAULT; return 0; } ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_qp cmd; struct ib_uverbs_ex_create_qp cmd_ex; struct ib_udata ucore; struct ib_udata uhw; ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp); int err; if (out_len < resp_size) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response), sizeof(cmd), 
resp_size); ib_uverbs_init_udata(&uhw, buf + sizeof(cmd), u64_to_user_ptr(cmd.response + resp_size), in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - resp_size); memset(&cmd_ex, 0, sizeof(cmd_ex)); cmd_ex.user_handle = cmd.user_handle; cmd_ex.pd_handle = cmd.pd_handle; cmd_ex.send_cq_handle = cmd.send_cq_handle; cmd_ex.recv_cq_handle = cmd.recv_cq_handle; cmd_ex.srq_handle = cmd.srq_handle; cmd_ex.max_send_wr = cmd.max_send_wr; cmd_ex.max_recv_wr = cmd.max_recv_wr; cmd_ex.max_send_sge = cmd.max_send_sge; cmd_ex.max_recv_sge = cmd.max_recv_sge; cmd_ex.max_inline_data = cmd.max_inline_data; cmd_ex.sq_sig_all = cmd.sq_sig_all; cmd_ex.qp_type = cmd.qp_type; cmd_ex.is_srq = cmd.is_srq; err = create_qp(file, &ucore, &uhw, &cmd_ex, offsetof(typeof(cmd_ex), is_srq) + sizeof(cmd.is_srq), ib_uverbs_create_qp_cb, NULL); if (err) return err; return in_len; } static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file, struct ib_uverbs_ex_create_qp_resp *resp, struct ib_udata *ucore) { if (ib_copy_to_udata(ucore, resp, resp->response_length)) return -EFAULT; return 0; } int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_create_qp_resp resp; struct ib_uverbs_ex_create_qp cmd = {0}; int err; if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) + sizeof(cmd.comp_mask))) return -EINVAL; err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (err) return err; if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK) return -EINVAL; if (cmd.reserved) return -EINVAL; if (ucore->outlen < (offsetof(typeof(resp), response_length) + sizeof(resp.response_length))) return -ENOSPC; err = create_qp(file, ucore, uhw, &cmd, min(ucore->inlen, sizeof(cmd)), ib_uverbs_ex_create_qp_cb, NULL); if (err) return err; return 0; } ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct 
ib_uverbs_open_qp cmd; struct ib_uverbs_create_qp_resp resp; struct ib_udata udata; struct ib_uqp_object *obj; struct ib_xrcd *xrcd; struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_qp *qp; struct ib_qp_open_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; ib_uverbs_init_udata(&udata, buf + sizeof cmd, u64_to_user_ptr(cmd.response + sizeof resp), in_len - sizeof cmd, out_len - sizeof resp); obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.qp_num = cmd.qpn; attr.qp_type = cmd.qp_type; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); qp = ib_open_qp(xrcd, &attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp, 0, sizeof resp); resp.qpn = qp->qp_num; resp.qp_handle = obj->uevent.uobject.id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_remove; } obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return in_len; err_remove: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); err_put: put_xrcd_read(xrcd_uobj); put_uobj_write(&obj->uevent.uobject); return ret; } ssize_t 
ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_qp cmd; struct ib_uverbs_query_qp_resp resp; struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_init_attr *init_attr; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; attr = kmalloc(sizeof *attr, GFP_KERNEL); init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); if (!attr || !init_attr) { ret = -ENOMEM; goto out; } qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; } ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); put_qp_read(qp); if (ret) goto out; memset(&resp, 0, sizeof resp); resp.qp_state = attr->qp_state; resp.cur_qp_state = attr->cur_qp_state; resp.path_mtu = attr->path_mtu; resp.path_mig_state = attr->path_mig_state; resp.qkey = attr->qkey; resp.rq_psn = attr->rq_psn; resp.sq_psn = attr->sq_psn; resp.dest_qp_num = attr->dest_qp_num; resp.qp_access_flags = attr->qp_access_flags; resp.pkey_index = attr->pkey_index; resp.alt_pkey_index = attr->alt_pkey_index; resp.sq_draining = attr->sq_draining; resp.max_rd_atomic = attr->max_rd_atomic; resp.max_dest_rd_atomic = attr->max_dest_rd_atomic; resp.min_rnr_timer = attr->min_rnr_timer; resp.port_num = attr->port_num; resp.timeout = attr->timeout; resp.retry_cnt = attr->retry_cnt; resp.rnr_retry = attr->rnr_retry; resp.alt_port_num = attr->alt_port_num; resp.alt_timeout = attr->alt_timeout; memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); resp.dest.flow_label = attr->ah_attr.grh.flow_label; resp.dest.sgid_index = attr->ah_attr.grh.sgid_index; resp.dest.hop_limit = attr->ah_attr.grh.hop_limit; resp.dest.traffic_class = attr->ah_attr.grh.traffic_class; resp.dest.dlid = attr->ah_attr.dlid; resp.dest.sl = attr->ah_attr.sl; resp.dest.src_path_bits = attr->ah_attr.src_path_bits; resp.dest.static_rate = attr->ah_attr.static_rate; resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH); resp.dest.port_num 
= attr->ah_attr.port_num; memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; resp.alt_dest.dlid = attr->alt_ah_attr.dlid; resp.alt_dest.sl = attr->alt_ah_attr.sl; resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate; resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH); resp.alt_dest.port_num = attr->alt_ah_attr.port_num; resp.max_send_wr = init_attr->cap.max_send_wr; resp.max_recv_wr = init_attr->cap.max_recv_wr; resp.max_send_sge = init_attr->cap.max_send_sge; resp.max_recv_sge = init_attr->cap.max_recv_sge; resp.max_inline_data = init_attr->cap.max_inline_data; resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: kfree(attr); kfree(init_attr); return ret ? 
ret : in_len; } /* Remove ignored fields set in the attribute mask */ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) { switch (qp_type) { case IB_QPT_XRC_INI: return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); case IB_QPT_XRC_TGT: return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY); default: return mask; } } ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_modify_qp cmd; struct ib_udata udata; struct ib_qp *qp; struct ib_qp_attr *attr; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) return -ENOMEM; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; } if ((cmd.attr_mask & IB_QP_PORT) && !rdma_is_port_valid(qp->device, cmd.port_num)) { ret = -EINVAL; goto release_qp; } if ((cmd.attr_mask & IB_QP_AV) && !rdma_is_port_valid(qp->device, cmd.dest.port_num)) { ret = -EINVAL; goto release_qp; } if ((cmd.attr_mask & IB_QP_ALT_PATH) && (!rdma_is_port_valid(qp->device, cmd.alt_port_num) || !rdma_is_port_valid(qp->device, cmd.alt_dest.port_num))) { ret = -EINVAL; goto release_qp; } attr->qp_state = cmd.qp_state; attr->cur_qp_state = cmd.cur_qp_state; attr->path_mtu = cmd.path_mtu; attr->path_mig_state = cmd.path_mig_state; attr->qkey = cmd.qkey; attr->rq_psn = cmd.rq_psn; attr->sq_psn = cmd.sq_psn; attr->dest_qp_num = cmd.dest_qp_num; attr->qp_access_flags = cmd.qp_access_flags; attr->pkey_index = cmd.pkey_index; attr->alt_pkey_index = cmd.alt_pkey_index; attr->en_sqd_async_notify = cmd.en_sqd_async_notify; attr->max_rd_atomic = cmd.max_rd_atomic; attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic; attr->min_rnr_timer = cmd.min_rnr_timer; attr->port_num = cmd.port_num; attr->timeout = cmd.timeout; attr->retry_cnt = cmd.retry_cnt; 
attr->rnr_retry = cmd.rnr_retry; attr->alt_port_num = cmd.alt_port_num; attr->alt_timeout = cmd.alt_timeout; memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16); attr->ah_attr.grh.flow_label = cmd.dest.flow_label; attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index; attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit; attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class; attr->ah_attr.dlid = cmd.dest.dlid; attr->ah_attr.sl = cmd.dest.sl; attr->ah_attr.src_path_bits = cmd.dest.src_path_bits; attr->ah_attr.static_rate = cmd.dest.static_rate; attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0; attr->ah_attr.port_num = cmd.dest.port_num; memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16); attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label; attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index; attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit; attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class; attr->alt_ah_attr.dlid = cmd.alt_dest.dlid; attr->alt_ah_attr.sl = cmd.alt_dest.sl; attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits; attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? 
IB_AH_GRH : 0;
	attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;

	if (qp->real_qp == qp) {
		if (cmd.attr_mask & IB_QP_AV) {
			ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr);
			if (ret)
				goto release_qp;
		}
		ret = qp->device->modify_qp(qp, attr,
			modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata);
	} else {
		ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask));
	}

	if (ret)
		goto release_qp;

	ret = in_len;

release_qp:
	put_qp_read(qp);

out:
	kfree(attr);

	return ret;
}

/*
 * Destroy-QP command.
 *
 * Looks up the QP uobject named by cmd.qp_handle for writing, refuses to
 * proceed while multicast attachments remain on it, destroys the QP, then
 * removes the uobject from the QP idr and the per-file list, releases its
 * pending events and reports the event count to user space.
 *
 * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL for an
 * unknown handle, -EBUSY while the multicast list is non-empty, or the
 * error returned by ib_destroy_qp().
 */
ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
			     struct ib_device *ib_dev,
			     const char __user *buf, int in_len,
			     int out_len)
{
	struct ib_uverbs_destroy_qp cmd;
	struct ib_uverbs_destroy_qp_resp resp;
	struct ib_uobject *uobj;
	struct ib_qp *qp;
	struct ib_uqp_object *obj;
	int ret = -EINVAL;

	if (copy_from_user(&cmd, buf, sizeof cmd))
		return -EFAULT;

	memset(&resp, 0, sizeof resp);

	uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
	if (!uobj)
		return -EINVAL;
	qp = uobj->object;
	obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);

	/* A QP that still has multicast attachments cannot be destroyed. */
	if (!list_empty(&obj->mcast_list)) {
		put_uobj_write(uobj);
		return -EBUSY;
	}

	ret = ib_destroy_qp(qp);
	/* Only a successfully destroyed QP is marked dead. */
	if (!ret)
		uobj->live = 0;

	put_uobj_write(uobj);

	/* On failure the uobject stays registered and listed. */
	if (ret)
		return ret;

	/* Drop the XRCD reference, if this QP handle holds one. */
	if (obj->uxrcd)
		atomic_dec(&obj->uxrcd->refcnt);

	idr_remove_uobj(&ib_uverbs_qp_idr, uobj);

	mutex_lock(&file->mutex);
	list_del(&uobj->list);
	mutex_unlock(&file->mutex);

	ib_uverbs_release_uevent(file, &obj->uevent);

	/* The counter is read before put_uobj() drops this function's reference. */
	resp.events_reported = obj->uevent.events_reported;

	put_uobj(uobj);

	if (copy_to_user((void __user *) (unsigned long) cmd.response,
			 &resp, sizeof resp))
		return -EFAULT;

	return in_len;
}

/*
 * Allocate one work request of wr_size bytes followed by room for num_sge
 * scatter/gather entries.  The header size is rounded up to a multiple of
 * sizeof(struct ib_sge) before the entries are appended.
 *
 * Returns the kmalloc() result (NULL on failure).
 */
static void *alloc_wr(size_t wr_size, __u32 num_sge)
{
	return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) +
		       num_sge * sizeof (struct ib_sge), GFP_KERNEL);
};

ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
			    struct ib_device *ib_dev,
			    const char __user *buf, int in_len,
			    int out_len)
{
	struct ib_uverbs_post_send cmd;
	struct ib_uverbs_post_send_resp resp;
	struct
ib_uverbs_send_wr *user_wr; - struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; + struct ib_send_wr *wr = NULL, *last, *next; + const struct ib_send_wr *bad_wr; struct ib_qp *qp; int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; size_t next_size; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count + cmd.sge_count * sizeof (struct ib_uverbs_sge)) return -EINVAL; if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr)) return -EINVAL; user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); if (!user_wr) return -ENOMEM; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) goto out; is_ud = qp->qp_type == IB_QPT_UD; sg_ind = 0; last = NULL; for (i = 0; i < cmd.wr_count; ++i) { if (copy_from_user(user_wr, buf + sizeof cmd + i * cmd.wqe_size, cmd.wqe_size)) { ret = -EFAULT; goto out_put; } if (user_wr->num_sge + sg_ind > cmd.sge_count) { ret = -EINVAL; goto out_put; } if (is_ud) { struct ib_ud_wr *ud; if (user_wr->opcode != IB_WR_SEND && user_wr->opcode != IB_WR_SEND_WITH_IMM) { ret = -EINVAL; goto out_put; } next_size = sizeof(*ud); ud = alloc_wr(next_size, user_wr->num_sge); if (!ud) { ret = -ENOMEM; goto out_put; } ud->ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); if (!ud->ah) { kfree(ud); ret = -EINVAL; goto out_put; } ud->remote_qpn = user_wr->wr.ud.remote_qpn; ud->remote_qkey = user_wr->wr.ud.remote_qkey; next = &ud->wr; } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || user_wr->opcode == IB_WR_RDMA_WRITE || user_wr->opcode == IB_WR_RDMA_READ) { struct ib_rdma_wr *rdma; next_size = sizeof(*rdma); rdma = alloc_wr(next_size, user_wr->num_sge); if (!rdma) { ret = -ENOMEM; goto out_put; } rdma->remote_addr = user_wr->wr.rdma.remote_addr; rdma->rkey = user_wr->wr.rdma.rkey; next = &rdma->wr; } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { struct ib_atomic_wr *atomic; next_size = sizeof(*atomic); atomic = alloc_wr(next_size, user_wr->num_sge); 
if (!atomic) { ret = -ENOMEM; goto out_put; } atomic->remote_addr = user_wr->wr.atomic.remote_addr; atomic->compare_add = user_wr->wr.atomic.compare_add; atomic->swap = user_wr->wr.atomic.swap; atomic->rkey = user_wr->wr.atomic.rkey; next = &atomic->wr; } else if (user_wr->opcode == IB_WR_SEND || user_wr->opcode == IB_WR_SEND_WITH_IMM || user_wr->opcode == IB_WR_SEND_WITH_INV) { next_size = sizeof(*next); next = alloc_wr(next_size, user_wr->num_sge); if (!next) { ret = -ENOMEM; goto out_put; } } else { ret = -EINVAL; goto out_put; } if (user_wr->opcode == IB_WR_SEND_WITH_IMM || user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { next->ex.imm_data = (__be32 __force) user_wr->ex.imm_data; } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) { next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey; } if (!last) wr = next; else last->next = next; last = next; next->next = NULL; next->wr_id = user_wr->wr_id; next->num_sge = user_wr->num_sge; next->opcode = user_wr->opcode; next->send_flags = user_wr->send_flags; if (next->num_sge) { next->sg_list = (void *)((char *)next + ALIGN(next_size, sizeof(struct ib_sge))); if (copy_from_user(next->sg_list, (const char *)buf + sizeof cmd + cmd.wr_count * cmd.wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; goto out_put; } sg_ind += next->num_sge; } else next->sg_list = NULL; } resp.bad_wr = 0; ret = qp->device->post_send(qp->real_qp, wr, &bad_wr); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out_put: put_qp_read(qp); while (wr) { if (is_ud && ud_wr(wr)->ah) put_ah_read(ud_wr(wr)->ah); next = wr->next; kfree(wr); wr = next; } out: kfree(user_wr); return ret ? 
ret : in_len; } static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, int in_len, u32 wr_count, u32 sge_count, u32 wqe_size) { struct ib_uverbs_recv_wr *user_wr; struct ib_recv_wr *wr = NULL, *last, *next; int sg_ind; int i; int ret; if (in_len < wqe_size * wr_count + sge_count * sizeof (struct ib_uverbs_sge)) return ERR_PTR(-EINVAL); if (wqe_size < sizeof (struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); user_wr = kmalloc(wqe_size, GFP_KERNEL); if (!user_wr) return ERR_PTR(-ENOMEM); sg_ind = 0; last = NULL; for (i = 0; i < wr_count; ++i) { if (copy_from_user(user_wr, buf + i * wqe_size, wqe_size)) { ret = -EFAULT; goto err; } if (user_wr->num_sge + sg_ind > sge_count) { ret = -EINVAL; goto err; } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), GFP_KERNEL); if (!next) { ret = -ENOMEM; goto err; } if (!last) wr = next; else last->next = next; last = next; next->next = NULL; next->wr_id = user_wr->wr_id; next->num_sge = user_wr->num_sge; if (next->num_sge) { next->sg_list = (void *)((char *)next + ALIGN(sizeof *next, sizeof (struct ib_sge))); if (copy_from_user(next->sg_list, (const char *)buf + wr_count * wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; goto err; } sg_ind += next->num_sge; } else next->sg_list = NULL; } kfree(user_wr); return wr; err: kfree(user_wr); while (wr) { next = wr->next; kfree(wr); wr = next; } return ERR_PTR(ret); } ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_recv cmd; struct ib_uverbs_post_recv_resp resp; - struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_recv_wr *wr, *next; + const struct ib_recv_wr *bad_wr; struct ib_qp *qp; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, in_len - sizeof cmd, cmd.wr_count, 
cmd.sge_count, cmd.wqe_size); if (IS_ERR(wr)) return PTR_ERR(wr); qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) goto out; resp.bad_wr = 0; ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); put_qp_read(qp); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: while (wr) { next = wr->next; kfree(wr); wr = next; } return ret ? ret : in_len; } ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_srq_recv cmd; struct ib_uverbs_post_srq_recv_resp resp; - struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_recv_wr *wr, *next; + const struct ib_recv_wr *bad_wr; struct ib_srq *srq; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, in_len - sizeof cmd, cmd.wr_count, cmd.sge_count, cmd.wqe_size); if (IS_ERR(wr)) return PTR_ERR(wr); srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) goto out; resp.bad_wr = 0; ret = srq->device->post_srq_recv(srq, wr, &bad_wr); put_srq_read(srq); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: while (wr) { next = wr->next; kfree(wr); wr = next; } return ret ? 
ret : in_len; } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_ah cmd; struct ib_uverbs_create_ah_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_ah *ah; struct ib_ah_attr attr; int ret; struct ib_udata udata; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) return -EINVAL; ib_uverbs_init_udata(&udata, buf + sizeof(cmd), u64_to_user_ptr(cmd.response + sizeof(resp)), in_len - sizeof(cmd), out_len - sizeof(resp)); uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; } attr.dlid = cmd.attr.dlid; attr.sl = cmd.attr.sl; attr.src_path_bits = cmd.attr.src_path_bits; attr.static_rate = cmd.attr.static_rate; attr.ah_flags = cmd.attr.is_global ? 
IB_AH_GRH : 0; attr.port_num = cmd.attr.port_num; attr.grh.flow_label = cmd.attr.grh.flow_label; attr.grh.sgid_index = cmd.attr.grh.sgid_index; attr.grh.hop_limit = cmd.attr.grh.hop_limit; attr.grh.traffic_class = cmd.attr.grh.traffic_class; memset(&attr.dmac, 0, sizeof(attr.dmac)); memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); ah = pd->device->create_ah(pd, &attr, &udata); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto err_put; } ah->device = pd->device; ah->pd = pd; atomic_inc(&pd->usecnt); ah->uobject = uobj; uobj->object = ah; ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj); if (ret) goto err_destroy; resp.ah_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->ah_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_ah_idr, uobj); err_destroy: ib_destroy_ah(ah); err_put: put_pd_read(pd); err: put_uobj_write(uobj); return ret; } ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_ah cmd; struct ib_ah *ah; struct ib_uobject *uobj; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext); if (!uobj) return -EINVAL; ah = uobj->object; ret = ib_destroy_ah(ah); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_ah_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_attach_mcast cmd; struct ib_qp *qp; struct ib_uqp_object *obj; struct ib_uverbs_mcast_entry *mcast; int ret; if 
(copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; qp = idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); mutex_lock(&obj->mcast_lock); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { ret = 0; goto out_put; } mcast = kmalloc(sizeof *mcast, GFP_KERNEL); if (!mcast) { ret = -ENOMEM; goto out_put; } mcast->lid = cmd.mlid; memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw); ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid); if (!ret) list_add_tail(&mcast->list, &obj->mcast_list); else kfree(mcast); out_put: mutex_unlock(&obj->mcast_lock); put_qp_write(qp); return ret ? ret : in_len; } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_detach_mcast cmd; struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; int ret = -EINVAL; bool found = false; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; qp = idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); mutex_lock(&obj->mcast_lock); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { list_del(&mcast->list); kfree(mcast); found = true; break; } if (!found) { ret = -EINVAL; goto out_put; } ret = ib_detach_mcast(qp, (union ib_gid *)cmd.gid, cmd.mlid); out_put: mutex_unlock(&obj->mcast_lock); put_qp_write(qp); return ret ? 
ret : in_len; } static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec) { /* Returns user space filter size, includes padding */ return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; } static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, u16 ib_real_filter_sz) { /* * User space filter structures must be 64 bit aligned, otherwise this * may pass, but we won't handle additional new attributes. */ if (kern_filter_size > ib_real_filter_sz) { if (memchr_inv((char *)kern_spec_filter + ib_real_filter_sz, 0, kern_filter_size - ib_real_filter_sz)) return -EINVAL; return ib_real_filter_sz; } return kern_filter_size; } static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { ssize_t actual_filter_sz; ssize_t kern_filter_sz; ssize_t ib_filter_sz; void *kern_spec_mask; void *kern_spec_val; if (kern_spec->reserved) return -EINVAL; ib_spec->type = kern_spec->type; kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); /* User flow spec size must be aligned to 4 bytes */ if (kern_filter_sz != ALIGN(kern_filter_sz, 4)) return -EINVAL; kern_spec_val = (char *)kern_spec + sizeof(struct ib_uverbs_flow_spec_hdr); kern_spec_mask = (char *)kern_spec_val + kern_filter_sz; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); if (actual_filter_sz <= 0) return -EINVAL; ib_spec->size = sizeof(struct ib_flow_spec_eth); memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz); memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_IPV4: ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); if (actual_filter_sz <= 0) return -EINVAL; ib_spec->size = sizeof(struct ib_flow_spec_ipv4); memcpy(&ib_spec->ipv4.val, kern_spec_val, 
actual_filter_sz); memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_IPV6: ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); if (actual_filter_sz <= 0) return -EINVAL; ib_spec->size = sizeof(struct ib_flow_spec_ipv6); memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz); memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz); if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) || (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20)) return -EINVAL; break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); if (actual_filter_sz <= 0) return -EINVAL; ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp); memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz); memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz); break; default: return -EINVAL; } return 0; } int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_create_wq cmd = {}; struct ib_uverbs_ex_create_wq_resp resp = {}; struct ib_uwq_object *obj; int err = 0; struct ib_cq *cq; struct ib_pd *pd; struct ib_wq *wq; struct ib_wq_init_attr wq_init_attr = {}; size_t required_cmd_sz; size_t required_resp_len; required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge); required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn); if (ucore->inlen < required_cmd_sz) return -EINVAL; if (ucore->outlen < required_resp_len) return -ENOSPC; if (ucore->inlen > sizeof(cmd) && !ib_is_udata_cleared(ucore, sizeof(cmd), ucore->inlen - sizeof(cmd))) return -EOPNOTSUPP; err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (err) return err; if (cmd.comp_mask) return -EOPNOTSUPP; obj = 
kmalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &wq_lock_class); down_write(&obj->uevent.uobject.mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { err = -EINVAL; goto err_uobj; } cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) { err = -EINVAL; goto err_put_pd; } wq_init_attr.cq = cq; wq_init_attr.max_sge = cmd.max_sge; wq_init_attr.max_wr = cmd.max_wr; wq_init_attr.wq_context = file; wq_init_attr.wq_type = cmd.wq_type; wq_init_attr.event_handler = ib_uverbs_wq_event_handler; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); wq = pd->device->create_wq(pd, &wq_init_attr, uhw); if (IS_ERR(wq)) { err = PTR_ERR(wq); goto err_put_cq; } wq->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = wq; wq->wq_type = wq_init_attr.wq_type; wq->cq = cq; wq->pd = pd; wq->device = pd->device; wq->wq_context = wq_init_attr.wq_context; atomic_set(&wq->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&cq->usecnt); wq->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = wq; err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); if (err) goto destroy_wq; memset(&resp, 0, sizeof(resp)); resp.wq_handle = obj->uevent.uobject.id; resp.max_sge = wq_init_attr.max_sge; resp.max_wr = wq_init_attr.max_wr; resp.wqn = wq->wq_num; resp.response_length = required_resp_len; err = ib_copy_to_udata(ucore, &resp, resp.response_length); if (err) goto err_copy; put_pd_read(pd); put_cq_read(cq); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return 0; err_copy: idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); destroy_wq: ib_destroy_wq(wq); err_put_cq: put_cq_read(cq); err_put_pd: put_pd_read(pd); err_uobj: put_uobj_write(&obj->uevent.uobject); return err; } int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file 
*file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_destroy_wq cmd = {}; struct ib_uverbs_ex_destroy_wq_resp resp = {}; struct ib_wq *wq; struct ib_uobject *uobj; struct ib_uwq_object *obj; size_t required_cmd_sz; size_t required_resp_len; int ret; required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle); required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); if (ucore->inlen < required_cmd_sz) return -EINVAL; if (ucore->outlen < required_resp_len) return -ENOSPC; if (ucore->inlen > sizeof(cmd) && !ib_is_udata_cleared(ucore, sizeof(cmd), ucore->inlen - sizeof(cmd))) return -EOPNOTSUPP; ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) return ret; if (cmd.comp_mask) return -EOPNOTSUPP; resp.response_length = required_resp_len; uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle, file->ucontext); if (!uobj) return -EINVAL; wq = uobj->object; obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); ret = ib_destroy_wq(wq); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_wq_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); ib_uverbs_release_uevent(file, &obj->uevent); resp.events_reported = obj->uevent.events_reported; put_uobj(uobj); ret = ib_copy_to_udata(ucore, &resp, resp.response_length); if (ret) return ret; return 0; } int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_modify_wq cmd = {}; struct ib_wq *wq; struct ib_wq_attr wq_attr = {}; size_t required_cmd_sz; int ret; required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state); if (ucore->inlen < required_cmd_sz) return -EINVAL; if (ucore->inlen > sizeof(cmd) && !ib_is_udata_cleared(ucore, sizeof(cmd), ucore->inlen - sizeof(cmd))) return -EOPNOTSUPP; ret = 
ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) return ret; if (!cmd.attr_mask) return -EINVAL; if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE)) return -EINVAL; wq = idr_read_wq(cmd.wq_handle, file->ucontext); if (!wq) return -EINVAL; wq_attr.curr_wq_state = cmd.curr_wq_state; wq_attr.wq_state = cmd.wq_state; ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw); put_wq_read(wq); return ret; } int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_create_rwq_ind_table cmd = {}; struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; struct ib_uobject *uobj; int err = 0; struct ib_rwq_ind_table_init_attr init_attr = {}; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_wq **wqs = NULL; u32 *wqs_handles = NULL; struct ib_wq *wq = NULL; int i, j, num_read_wqs; u32 num_wq_handles; u32 expected_in_size; size_t required_cmd_sz_header; size_t required_resp_len; required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size); required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); if (ucore->inlen < required_cmd_sz_header) return -EINVAL; if (ucore->outlen < required_resp_len) return -ENOSPC; err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header); if (err) return err; ucore->inbuf = (const char *)ucore->inbuf + required_cmd_sz_header; ucore->inlen -= required_cmd_sz_header; if (cmd.comp_mask) return -EOPNOTSUPP; if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE) return -EINVAL; num_wq_handles = 1 << cmd.log_ind_tbl_size; expected_in_size = num_wq_handles * sizeof(__u32); if (num_wq_handles == 1) /* input size for wq handles is u64 aligned */ expected_in_size += sizeof(__u32); if (ucore->inlen < expected_in_size) return -EINVAL; if (ucore->inlen > expected_in_size && !ib_is_udata_cleared(ucore, expected_in_size, ucore->inlen - expected_in_size)) return 
-EOPNOTSUPP; wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), GFP_KERNEL); if (!wqs_handles) return -ENOMEM; err = ib_copy_from_udata(wqs_handles, ucore, num_wq_handles * sizeof(__u32)); if (err) goto err_free; wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); if (!wqs) { err = -ENOMEM; goto err_free; } for (num_read_wqs = 0; num_read_wqs < num_wq_handles; num_read_wqs++) { wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext); if (!wq) { err = -EINVAL; goto put_wqs; } wqs[num_read_wqs] = wq; } uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); if (!uobj) { err = -ENOMEM; goto put_wqs; } init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class); down_write(&uobj->mutex); init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; init_attr.ind_tbl = wqs; rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw); if (IS_ERR(rwq_ind_tbl)) { err = PTR_ERR(rwq_ind_tbl); goto err_uobj; } rwq_ind_tbl->ind_tbl = wqs; rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; rwq_ind_tbl->uobject = uobj; uobj->object = rwq_ind_tbl; rwq_ind_tbl->device = ib_dev; atomic_set(&rwq_ind_tbl->usecnt, 0); for (i = 0; i < num_wq_handles; i++) atomic_inc(&wqs[i]->usecnt); err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); if (err) goto destroy_ind_tbl; resp.ind_tbl_handle = uobj->id; resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; resp.response_length = required_resp_len; err = ib_copy_to_udata(ucore, &resp, resp.response_length); if (err) goto err_copy; kfree(wqs_handles); for (j = 0; j < num_read_wqs; j++) put_wq_read(wqs[j]); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return 0; err_copy: idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); destroy_ind_tbl: ib_destroy_rwq_ind_table(rwq_ind_tbl); err_uobj: put_uobj_write(uobj); put_wqs: for (j = 0; j < num_read_wqs; j++) put_wq_read(wqs[j]); err_free: kfree(wqs_handles); kfree(wqs); return err; } int 
ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_uobject *uobj; int ret; struct ib_wq **ind_tbl; size_t required_cmd_sz; required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle); if (ucore->inlen < required_cmd_sz) return -EINVAL; if (ucore->inlen > sizeof(cmd) && !ib_is_udata_cleared(ucore, sizeof(cmd), ucore->inlen - sizeof(cmd))) return -EOPNOTSUPP; ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) return ret; if (cmd.comp_mask) return -EOPNOTSUPP; uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle, file->ucontext); if (!uobj) return -EINVAL; rwq_ind_tbl = uobj->object; ind_tbl = rwq_ind_tbl->ind_tbl; ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); kfree(ind_tbl); return ret; } int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; int err = 0; void *kern_spec; void *ib_spec; int i; if (ucore->inlen < sizeof(cmd)) return -EINVAL; if (ucore->outlen < sizeof(resp)) return -ENOSPC; err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (err) return err; ucore->inbuf = (const char *)ucore->inbuf + sizeof(cmd); ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; if (priv_check(curthread, PRIV_NET_RAW) != 0) return -EPERM; if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) return -EINVAL; if 
((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) return -EINVAL; if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; if (cmd.flow_attr.size > ucore->inlen || cmd.flow_attr.size > (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; if (cmd.flow_attr.reserved[0] || cmd.flow_attr.reserved[1]) return -EINVAL; if (cmd.flow_attr.num_of_specs) { kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, GFP_KERNEL); if (!kern_flow_attr) return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); err = ib_copy_from_udata(kern_flow_attr + 1, ucore, cmd.flow_attr.size); if (err) goto err_free_attr; } else { kern_flow_attr = &cmd.flow_attr; } uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); if (!uobj) { err = -ENOMEM; goto err_free_attr; } init_uobj(uobj, 0, file->ucontext, &rule_lock_class); down_write(&uobj->mutex); qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { err = -EINVAL; goto err_uobj; } flow_attr = kzalloc(sizeof(*flow_attr) + cmd.flow_attr.num_of_specs * sizeof(union ib_flow_spec), GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; } flow_attr->type = kern_flow_attr->type; flow_attr->priority = kern_flow_attr->priority; flow_attr->num_of_specs = kern_flow_attr->num_of_specs; flow_attr->port = kern_flow_attr->port; flow_attr->flags = kern_flow_attr->flags; flow_attr->size = sizeof(*flow_attr); kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs && cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && cmd.flow_attr.size >= ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; kern_spec = (char 
*)kern_spec + ((struct ib_uverbs_flow_spec *) kern_spec)->size; ib_spec = (char *)ib_spec + ((union ib_flow_spec *)ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", i, cmd.flow_attr.size); err = -EINVAL; goto err_free; } flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); goto err_free; } flow_id->qp = qp; flow_id->uobject = uobj; uobj->object = flow_id; err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); if (err) goto destroy_flow; memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); if (err) goto err_copy; put_qp_read(qp); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->rule_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); return 0; err_copy: idr_remove_uobj(&ib_uverbs_rule_idr, uobj); destroy_flow: ib_destroy_flow(flow_id); err_free: kfree(flow_attr); err_put: put_qp_read(qp); err_uobj: put_uobj_write(uobj); err_free_attr: if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); return err; } int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_destroy_flow cmd; struct ib_flow *flow_id; struct ib_uobject *uobj; int ret; if (ucore->inlen < sizeof(cmd)) return -EINVAL; ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (ret) return ret; if (cmd.comp_mask) return -EINVAL; uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); if (!uobj) return -EINVAL; flow_id = uobj->object; ret = ib_destroy_flow(flow_id); if (!ret) uobj->live = 0; put_uobj_write(uobj); idr_remove_uobj(&ib_uverbs_rule_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return ret; } static int 
__uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_uverbs_create_xsrq *cmd, struct ib_udata *udata) { struct ib_uverbs_create_srq_resp resp; struct ib_usrq_object *obj; struct ib_pd *pd; struct ib_srq *srq; struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_srq_init_attr attr; int ret; obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); down_write(&obj->uevent.uobject.mutex); if (cmd->srq_type == IB_SRQT_XRC) { attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); if (!attr.ext.xrc.xrcd) { ret = -EINVAL; goto err; } obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); if (!attr.ext.xrc.cq) { ret = -EINVAL; goto err_put_xrcd; } } pd = idr_read_pd(cmd->pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_put_cq; } attr.event_handler = ib_uverbs_srq_event_handler; attr.srq_context = file; attr.srq_type = cmd->srq_type; attr.attr.max_wr = cmd->max_wr; attr.attr.max_sge = cmd->max_sge; attr.attr.srq_limit = cmd->srq_limit; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); srq = pd->device->create_srq(pd, &attr, udata); if (IS_ERR(srq)) { ret = PTR_ERR(srq); goto err_put; } srq->device = pd->device; srq->pd = pd; srq->srq_type = cmd->srq_type; srq->uobject = &obj->uevent.uobject; srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; if (cmd->srq_type == IB_SRQT_XRC) { srq->ext.xrc.cq = attr.ext.xrc.cq; srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; atomic_inc(&attr.ext.xrc.cq->usecnt); atomic_inc(&attr.ext.xrc.xrcd->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); obj->uevent.uobject.object = srq; ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp, 0, sizeof resp); 
resp.srq_handle = obj->uevent.uobject.id; resp.max_wr = attr.attr.max_wr; resp.max_sge = attr.attr.max_sge; if (cmd->srq_type == IB_SRQT_XRC) resp.srqn = srq->ext.xrc.srq_num; if (copy_to_user((void __user *) (unsigned long) cmd->response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } if (cmd->srq_type == IB_SRQT_XRC) { put_uobj_read(xrcd_uobj); put_cq_read(attr.ext.xrc.cq); } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); return 0; err_copy: idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); err_destroy: ib_destroy_srq(srq); err_put: put_pd_read(pd); err_put_cq: if (cmd->srq_type == IB_SRQT_XRC) put_cq_read(attr.ext.xrc.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { atomic_dec(&obj->uxrcd->refcnt); put_uobj_read(xrcd_uobj); } err: put_uobj_write(&obj->uevent.uobject); return ret; } ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_srq cmd; struct ib_uverbs_create_xsrq xcmd; struct ib_uverbs_create_srq_resp resp; struct ib_udata udata; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; xcmd.response = cmd.response; xcmd.user_handle = cmd.user_handle; xcmd.srq_type = IB_SRQT_BASIC; xcmd.pd_handle = cmd.pd_handle; xcmd.max_wr = cmd.max_wr; xcmd.max_sge = cmd.max_sge; xcmd.srq_limit = cmd.srq_limit; ib_uverbs_init_udata(&udata, buf + sizeof cmd, u64_to_user_ptr(cmd.response + sizeof resp), in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) return ret; return in_len; } ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct 
ib_uverbs_create_xsrq cmd; struct ib_uverbs_create_srq_resp resp; struct ib_udata udata; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; ib_uverbs_init_udata(&udata, buf + sizeof cmd, u64_to_user_ptr(cmd.response + sizeof resp), in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) return ret; return in_len; } ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_modify_srq cmd; struct ib_udata udata; struct ib_srq *srq; struct ib_srq_attr attr; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; attr.max_wr = cmd.max_wr; attr.srq_limit = cmd.srq_limit; ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); put_srq_read(srq); return ret ? 
ret : in_len; } ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_srq cmd; struct ib_uverbs_query_srq_resp resp; struct ib_srq_attr attr; struct ib_srq *srq; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; ret = ib_query_srq(srq, &attr); put_srq_read(srq); if (ret) return ret; memset(&resp, 0, sizeof resp); resp.max_wr = attr.max_wr; resp.max_sge = attr.max_sge; resp.srq_limit = attr.srq_limit; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_srq cmd; struct ib_uverbs_destroy_srq_resp resp; struct ib_uobject *uobj; struct ib_srq *srq; struct ib_uevent_object *obj; int ret = -EINVAL; struct ib_usrq_object *us; enum ib_srq_type srq_type; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); if (!uobj) return -EINVAL; srq = uobj->object; obj = container_of(uobj, struct ib_uevent_object, uobject); srq_type = srq->srq_type; ret = ib_destroy_srq(srq); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; if (srq_type == IB_SRQT_XRC) { us = container_of(obj, struct ib_usrq_object, uevent); atomic_dec(&us->uxrcd->refcnt); } idr_remove_uobj(&ib_uverbs_srq_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); ib_uverbs_release_uevent(file, obj); memset(&resp, 0, sizeof resp); resp.events_reported = obj->events_reported; put_uobj(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; return ret ? 
ret : in_len; } int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_query_device_resp resp = { {0} }; struct ib_uverbs_ex_query_device cmd; struct ib_device_attr attr = {0}; int err; if (ucore->inlen < sizeof(cmd)) return -EINVAL; err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (err) return err; if (cmd.comp_mask) return -EINVAL; if (cmd.reserved) return -EINVAL; resp.response_length = offsetof(typeof(resp), odp_caps); if (ucore->outlen < resp.response_length) return -ENOSPC; err = ib_dev->query_device(ib_dev, &attr, uhw); if (err) return err; copy_query_dev_fields(file, ib_dev, &resp.base, &attr); if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) goto end; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING resp.odp_caps.general_caps = attr.odp_caps.general_caps; resp.odp_caps.per_transport_caps.rc_odp_caps = attr.odp_caps.per_transport_caps.rc_odp_caps; resp.odp_caps.per_transport_caps.uc_odp_caps = attr.odp_caps.per_transport_caps.uc_odp_caps; resp.odp_caps.per_transport_caps.ud_odp_caps = attr.odp_caps.per_transport_caps.ud_odp_caps; #endif resp.response_length += sizeof(resp.odp_caps); if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask)) goto end; resp.timestamp_mask = attr.timestamp_mask; resp.response_length += sizeof(resp.timestamp_mask); if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock)) goto end; resp.hca_core_clock = attr.hca_core_clock; resp.response_length += sizeof(resp.hca_core_clock); if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex)) goto end; resp.device_cap_flags_ex = attr.device_cap_flags; resp.response_length += sizeof(resp.device_cap_flags_ex); if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps)) goto end; resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts; resp.rss_caps.max_rwq_indirection_tables = attr.rss_caps.max_rwq_indirection_tables; 
resp.rss_caps.max_rwq_indirection_table_size = attr.rss_caps.max_rwq_indirection_table_size; resp.response_length += sizeof(resp.rss_caps); if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq)) goto end; resp.max_wq_type_rq = attr.max_wq_type_rq; resp.response_length += sizeof(resp.max_wq_type_rq); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; } diff --git a/sys/ofed/drivers/infiniband/core/ib_verbs.c b/sys/ofed/drivers/infiniband/core/ib_verbs.c index 32be78f118b5..040a0d401649 100644 --- a/sys/ofed/drivers/infiniband/core/ib_verbs.c +++ b/sys/ofed/drivers/infiniband/core/ib_verbs.c @@ -1,2132 +1,2133 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "core_priv.h" static const char * const ib_events[] = { [IB_EVENT_CQ_ERR] = "CQ error", [IB_EVENT_QP_FATAL] = "QP fatal error", [IB_EVENT_QP_REQ_ERR] = "QP request error", [IB_EVENT_QP_ACCESS_ERR] = "QP access error", [IB_EVENT_COMM_EST] = "communication established", [IB_EVENT_SQ_DRAINED] = "send queue drained", [IB_EVENT_PATH_MIG] = "path migration successful", [IB_EVENT_PATH_MIG_ERR] = "path migration error", [IB_EVENT_DEVICE_FATAL] = "device fatal error", [IB_EVENT_PORT_ACTIVE] = "port active", [IB_EVENT_PORT_ERR] = "port error", [IB_EVENT_LID_CHANGE] = "LID change", [IB_EVENT_PKEY_CHANGE] = "P_key change", [IB_EVENT_SM_CHANGE] = "SM change", [IB_EVENT_SRQ_ERR] = "SRQ error", [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", [IB_EVENT_CLIENT_REREGISTER] = "client reregister", [IB_EVENT_GID_CHANGE] = "GID changed", }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event) { size_t index = event; return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? 
ib_events[index] : "unrecognized event"; } EXPORT_SYMBOL(ib_event_msg); static const char * const wc_statuses[] = { [IB_WC_SUCCESS] = "success", [IB_WC_LOC_LEN_ERR] = "local length error", [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", [IB_WC_LOC_PROT_ERR] = "local protection error", [IB_WC_WR_FLUSH_ERR] = "WR flushed", [IB_WC_MW_BIND_ERR] = "memory management operation error", [IB_WC_BAD_RESP_ERR] = "bad response error", [IB_WC_LOC_ACCESS_ERR] = "local access error", [IB_WC_REM_INV_REQ_ERR] = "invalid request error", [IB_WC_REM_ACCESS_ERR] = "remote access error", [IB_WC_REM_OP_ERR] = "remote operation error", [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", [IB_WC_REM_ABORT_ERR] = "operation aborted", [IB_WC_INV_EECN_ERR] = "invalid EE context number", [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", [IB_WC_FATAL_ERR] = "fatal error", [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", [IB_WC_GENERAL_ERR] = "general error", }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) { size_t index = status; return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? 
wc_statuses[index] : "unrecognized status"; } EXPORT_SYMBOL(ib_wc_status_msg); __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 1; case IB_RATE_5_GBPS: return 2; case IB_RATE_10_GBPS: return 4; case IB_RATE_20_GBPS: return 8; case IB_RATE_30_GBPS: return 12; case IB_RATE_40_GBPS: return 16; case IB_RATE_60_GBPS: return 24; case IB_RATE_80_GBPS: return 32; case IB_RATE_120_GBPS: return 48; case IB_RATE_14_GBPS: return 6; case IB_RATE_56_GBPS: return 22; case IB_RATE_112_GBPS: return 45; case IB_RATE_168_GBPS: return 67; case IB_RATE_25_GBPS: return 10; case IB_RATE_100_GBPS: return 40; case IB_RATE_200_GBPS: return 80; case IB_RATE_300_GBPS: return 120; case IB_RATE_28_GBPS: return 11; case IB_RATE_50_GBPS: return 20; case IB_RATE_400_GBPS: return 160; case IB_RATE_600_GBPS: return 240; default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mult); __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { case 1: return IB_RATE_2_5_GBPS; case 2: return IB_RATE_5_GBPS; case 4: return IB_RATE_10_GBPS; case 8: return IB_RATE_20_GBPS; case 12: return IB_RATE_30_GBPS; case 16: return IB_RATE_40_GBPS; case 24: return IB_RATE_60_GBPS; case 32: return IB_RATE_80_GBPS; case 48: return IB_RATE_120_GBPS; case 6: return IB_RATE_14_GBPS; case 22: return IB_RATE_56_GBPS; case 45: return IB_RATE_112_GBPS; case 67: return IB_RATE_168_GBPS; case 10: return IB_RATE_25_GBPS; case 40: return IB_RATE_100_GBPS; case 80: return IB_RATE_200_GBPS; case 120: return IB_RATE_300_GBPS; case 11: return IB_RATE_28_GBPS; case 20: return IB_RATE_50_GBPS; case 160: return IB_RATE_400_GBPS; case 240: return IB_RATE_600_GBPS; default: return IB_RATE_PORT_CURRENT; } } EXPORT_SYMBOL(mult_to_ib_rate); __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 2500; case IB_RATE_5_GBPS: return 5000; case IB_RATE_10_GBPS: return 10000; case IB_RATE_20_GBPS: return 20000; case 
IB_RATE_30_GBPS: return 30000; case IB_RATE_40_GBPS: return 40000; case IB_RATE_60_GBPS: return 60000; case IB_RATE_80_GBPS: return 80000; case IB_RATE_120_GBPS: return 120000; case IB_RATE_14_GBPS: return 14062; case IB_RATE_56_GBPS: return 56250; case IB_RATE_112_GBPS: return 112500; case IB_RATE_168_GBPS: return 168750; case IB_RATE_25_GBPS: return 25781; case IB_RATE_100_GBPS: return 103125; case IB_RATE_200_GBPS: return 206250; case IB_RATE_300_GBPS: return 309375; case IB_RATE_28_GBPS: return 28125; case IB_RATE_50_GBPS: return 53125; case IB_RATE_400_GBPS: return 425000; case IB_RATE_600_GBPS: return 637500; default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { switch (node_type) { case RDMA_NODE_IB_CA: case RDMA_NODE_IB_SWITCH: case RDMA_NODE_IB_ROUTER: return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; case RDMA_NODE_USNIC: return RDMA_TRANSPORT_USNIC; case RDMA_NODE_USNIC_UDP: return RDMA_TRANSPORT_USNIC_UDP; default: BUG(); return 0; } } EXPORT_SYMBOL(rdma_node_get_transport); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) { if (device->get_link_layer) return device->get_link_layer(device, port_num); switch (rdma_node_get_transport(device->node_type)) { case RDMA_TRANSPORT_IB: return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_USNIC: case RDMA_TRANSPORT_USNIC_UDP: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; } } EXPORT_SYMBOL(rdma_port_get_link_layer); /* Protection domains */ /** * ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. 
* * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. */ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller) { struct ib_pd *pd; int mr_access_flags = 0; pd = device->alloc_pd(device, NULL, NULL); if (IS_ERR(pd)) return pd; pd->device = device; pd->uobject = NULL; pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); pd->flags = flags; if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else mr_access_flags |= IB_ACCESS_LOCAL_WRITE; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { pr_warn("%s: enabling unsafe global rkey\n", caller); mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } if (mr_access_flags) { struct ib_mr *mr; mr = pd->device->get_dma_mr(pd, mr_access_flags); if (IS_ERR(mr)) { ib_dealloc_pd(pd); return ERR_CAST(mr); } mr->device = pd->device; mr->pd = pd; mr->uobject = NULL; mr->need_inval = false; pd->__internal_mr = mr; if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) pd->local_dma_lkey = pd->__internal_mr->lkey; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) pd->unsafe_global_rkey = pd->__internal_mr->rkey; } return pd; } EXPORT_SYMBOL(__ib_alloc_pd); /** * ib_dealloc_pd - Deallocates a protection domain. * @pd: The protection domain to deallocate. * * It is an error to call this function while any resources in the pd still * exist. The caller is responsible to synchronously destroy them and * guarantee no new allocations will happen. */ void ib_dealloc_pd(struct ib_pd *pd) { int ret; if (pd->__internal_mr) { ret = pd->device->dereg_mr(pd->__internal_mr); WARN_ON(ret); pd->__internal_mr = NULL; } /* uverbs manipulates usecnt with proper locking, while the kabi requires the caller to guarantee we can't race here. */ WARN_ON(atomic_read(&pd->usecnt)); /* Making delalloc_pd a void return is a WIP, no driver should return an error here. 
*/ ret = pd->device->dealloc_pd(pd); WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); } EXPORT_SYMBOL(ib_dealloc_pd); /* Address handles */ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { struct ib_ah *ah; ah = pd->device->create_ah(pd, ah_attr, NULL); if (!IS_ERR(ah)) { ah->device = pd->device; ah->pd = pd; ah->uobject = NULL; atomic_inc(&pd->usecnt); } return ah; } EXPORT_SYMBOL(ib_create_ah); static int ib_get_header_version(const union rdma_network_hdr *hdr) { const struct ip *ip4h = (const struct ip *)&hdr->roce4grh; struct ip ip4h_checked; const struct ip6_hdr *ip6h = (const struct ip6_hdr *)&hdr->ibgrh; /* If it's IPv6, the version must be 6, otherwise, the first * 20 bytes (before the IPv4 header) are garbled. */ if ((ip6h->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) return (ip4h->ip_v == 4) ? 4 : 0; /* version may be 6 or 4 because the first 20 bytes could be garbled */ /* RoCE v2 requires no options, thus header length * must be 5 words */ if (ip4h->ip_hl != 5) return 6; /* Verify checksum. * We can't write on scattered buffers so we need to copy to * temp buffer. 
*/ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); ip4h_checked.ip_sum = 0; #if defined(INET) || defined(INET6) ip4h_checked.ip_sum = in_cksum_hdr(&ip4h_checked); #endif /* if IPv4 header checksum is OK, believe it */ if (ip4h->ip_sum == ip4h_checked.ip_sum) return 4; return 6; } static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, u8 port_num, const struct ib_grh *grh) { int grh_version; if (rdma_protocol_ib(device, port_num)) return RDMA_NETWORK_IB; grh_version = ib_get_header_version((const union rdma_network_hdr *)grh); if (grh_version == 4) return RDMA_NETWORK_IPV4; if (grh->next_hdr == IPPROTO_UDP) return RDMA_NETWORK_IPV6; return RDMA_NETWORK_ROCE_V1; } struct find_gid_index_context { u16 vlan_id; enum ib_gid_type gid_type; }; /* * This function will return true only if a inspected GID index * matches the request based on the GID type and VLAN configuration */ static bool find_gid_index(const union ib_gid *gid, const struct ib_gid_attr *gid_attr, void *context) { u16 vlan_diff; struct find_gid_index_context *ctx = (struct find_gid_index_context *)context; if (ctx->gid_type != gid_attr->gid_type) return false; /* * The following will verify: * 1. VLAN ID matching for VLAN tagged requests. * 2. prio-tagged/untagged to prio-tagged/untagged matching. * * This XOR is valid, since 0x0 < vlan_id < 0x0FFF. 
*/ vlan_diff = rdma_vlan_dev_vlan_id(gid_attr->ndev) ^ ctx->vlan_id; return (vlan_diff == 0x0000 || vlan_diff == 0xFFFF); } static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, u16 vlan_id, const union ib_gid *sgid, enum ib_gid_type gid_type, u16 *gid_index) { struct find_gid_index_context context = {.vlan_id = vlan_id, .gid_type = gid_type}; return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, &context, gid_index); } static int get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, enum rdma_network_type net_type, union ib_gid *sgid, union ib_gid *dgid) { struct sockaddr_in src_in; struct sockaddr_in dst_in; __be32 src_saddr, dst_saddr; if (!sgid || !dgid) return -EINVAL; if (net_type == RDMA_NETWORK_IPV4) { memcpy(&src_in.sin_addr.s_addr, &hdr->roce4grh.ip_src, 4); memcpy(&dst_in.sin_addr.s_addr, &hdr->roce4grh.ip_dst, 4); src_saddr = src_in.sin_addr.s_addr; dst_saddr = dst_in.sin_addr.s_addr; ipv6_addr_set_v4mapped(src_saddr, (struct in6_addr *)sgid); ipv6_addr_set_v4mapped(dst_saddr, (struct in6_addr *)dgid); return 0; } else if (net_type == RDMA_NETWORK_IPV6 || net_type == RDMA_NETWORK_IB) { *dgid = hdr->ibgrh.dgid; *sgid = hdr->ibgrh.sgid; return 0; } else { return -EINVAL; } } int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct ib_ah_attr *ah_attr) { u32 flow_class; u16 gid_index = 0; int ret; enum rdma_network_type net_type = RDMA_NETWORK_IB; enum ib_gid_type gid_type = IB_GID_TYPE_IB; int hoplimit = 0xff; union ib_gid dgid; union ib_gid sgid; memset(ah_attr, 0, sizeof *ah_attr); if (rdma_cap_eth_ah(device, port_num)) { if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) net_type = wc->network_hdr_type; else net_type = ib_get_net_type_by_grh(device, port_num, grh); gid_type = ib_network_to_gid_type(net_type); } ret = get_gids_from_rdma_hdr((const union rdma_network_hdr *)grh, net_type, &sgid, &dgid); if (ret) return ret; if (rdma_protocol_roce(device, 
port_num)) { struct ib_gid_attr dgid_attr; const u16 vlan_id = (wc->wc_flags & IB_WC_WITH_VLAN) ? wc->vlan_id : 0xffff; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; ret = get_sgid_index_from_eth(device, port_num, vlan_id, &dgid, gid_type, &gid_index); if (ret) return ret; ret = ib_get_cached_gid(device, port_num, gid_index, &dgid, &dgid_attr); if (ret) return ret; if (dgid_attr.ndev == NULL) return -ENODEV; ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, ah_attr->dmac, dgid_attr.ndev, &hoplimit); dev_put(dgid_attr.ndev); if (ret) return ret; } ah_attr->dlid = wc->slid; ah_attr->sl = wc->sl; ah_attr->src_path_bits = wc->dlid_path_bits; ah_attr->port_num = port_num; if (wc->wc_flags & IB_WC_GRH) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = sgid; if (!rdma_cap_eth_ah(device, port_num)) { if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { ret = ib_find_cached_gid_by_port(device, &dgid, IB_GID_TYPE_IB, port_num, NULL, &gid_index); if (ret) return ret; } } ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); ah_attr->grh.flow_label = flow_class & 0xFFFFF; ah_attr->grh.hop_limit = hoplimit; ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; } return 0; } EXPORT_SYMBOL(ib_init_ah_from_wc); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num) { struct ib_ah_attr ah_attr; int ret; ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); return ib_create_ah(pd, &ah_attr); } EXPORT_SYMBOL(ib_create_ah_from_wc); int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) { return ah->device->modify_ah ? ah->device->modify_ah(ah, ah_attr) : -ENOSYS; } EXPORT_SYMBOL(ib_modify_ah); int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) { return ah->device->query_ah ? 
ah->device->query_ah(ah, ah_attr) : -ENOSYS; } EXPORT_SYMBOL(ib_query_ah); int ib_destroy_ah(struct ib_ah *ah) { struct ib_pd *pd; int ret; pd = ah->pd; ret = ah->device->destroy_ah(ah); if (!ret) atomic_dec(&pd->usecnt); return ret; } EXPORT_SYMBOL(ib_destroy_ah); /* Shared receive queues */ struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) { struct ib_srq *srq; if (!pd->device->create_srq) return ERR_PTR(-ENOSYS); srq = pd->device->create_srq(pd, srq_init_attr, NULL); if (!IS_ERR(srq)) { srq->device = pd->device; srq->pd = pd; srq->uobject = NULL; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; atomic_inc(&srq->ext.xrc.xrcd->usecnt); atomic_inc(&srq->ext.xrc.cq->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); } return srq; } EXPORT_SYMBOL(ib_create_srq); int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask) { return srq->device->modify_srq ? srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : -ENOSYS; } EXPORT_SYMBOL(ib_modify_srq); int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) { return srq->device->query_srq ? 
srq->device->query_srq(srq, srq_attr) : -ENOSYS; } EXPORT_SYMBOL(ib_query_srq); int ib_destroy_srq(struct ib_srq *srq) { struct ib_pd *pd; enum ib_srq_type srq_type; struct ib_xrcd *uninitialized_var(xrcd); struct ib_cq *uninitialized_var(cq); int ret; if (atomic_read(&srq->usecnt)) return -EBUSY; pd = srq->pd; srq_type = srq->srq_type; if (srq_type == IB_SRQT_XRC) { xrcd = srq->ext.xrc.xrcd; cq = srq->ext.xrc.cq; } ret = srq->device->destroy_srq(srq); if (!ret) { atomic_dec(&pd->usecnt); if (srq_type == IB_SRQT_XRC) { atomic_dec(&xrcd->usecnt); atomic_dec(&cq->usecnt); } } return ret; } EXPORT_SYMBOL(ib_destroy_srq); /* Queue pairs */ static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = context; unsigned long flags; spin_lock_irqsave(&qp->device->event_handler_lock, flags); list_for_each_entry(event->element.qp, &qp->open_list, open_list) if (event->element.qp->event_handler) event->element.qp->event_handler(event, event->element.qp->qp_context); spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); } static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) { mutex_lock(&xrcd->tgt_qp_mutex); list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); mutex_unlock(&xrcd->tgt_qp_mutex); } static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, void (*event_handler)(struct ib_event *, void *), void *qp_context) { struct ib_qp *qp; unsigned long flags; qp = kzalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); qp->real_qp = real_qp; atomic_inc(&real_qp->usecnt); qp->device = real_qp->device; qp->event_handler = event_handler; qp->qp_context = qp_context; qp->qp_num = real_qp->qp_num; qp->qp_type = real_qp->qp_type; spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); list_add(&qp->open_list, &real_qp->open_list); spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); return qp; } struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr) { struct 
ib_qp *qp, *real_qp; if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) return ERR_PTR(-EINVAL); qp = ERR_PTR(-EINVAL); mutex_lock(&xrcd->tgt_qp_mutex); list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { if (real_qp->qp_num == qp_open_attr->qp_num) { qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, qp_open_attr->qp_context); break; } } mutex_unlock(&xrcd->tgt_qp_mutex); return qp; } EXPORT_SYMBOL(ib_open_qp); static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *real_qp = qp; qp->event_handler = __ib_shared_qp_event_handler; qp->qp_context = qp; qp->pd = NULL; qp->send_cq = qp->recv_cq = NULL; qp->srq = NULL; qp->xrcd = qp_init_attr->xrcd; atomic_inc(&qp_init_attr->xrcd->usecnt); INIT_LIST_HEAD(&qp->open_list); qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, qp_init_attr->qp_context); if (!IS_ERR(qp)) __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); else real_qp->device->destroy_qp(real_qp); return qp; } struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct ib_device *device = pd ? 
pd->device : qp_init_attr->xrcd->device; struct ib_qp *qp; if (qp_init_attr->rwq_ind_tbl && (qp_init_attr->recv_cq || qp_init_attr->srq || qp_init_attr->cap.max_recv_wr || qp_init_attr->cap.max_recv_sge)) return ERR_PTR(-EINVAL); qp = device->create_qp(pd, qp_init_attr, NULL); if (IS_ERR(qp)) return qp; qp->device = device; qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = qp_init_attr->qp_type; qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; atomic_set(&qp->usecnt, 0); spin_lock_init(&qp->mr_lock); if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) return ib_create_xrc_qp(qp, qp_init_attr); qp->event_handler = qp_init_attr->event_handler; qp->qp_context = qp_init_attr->qp_context; if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { qp->recv_cq = NULL; qp->srq = NULL; } else { qp->recv_cq = qp_init_attr->recv_cq; if (qp_init_attr->recv_cq) atomic_inc(&qp_init_attr->recv_cq->usecnt); qp->srq = qp_init_attr->srq; if (qp->srq) atomic_inc(&qp_init_attr->srq->usecnt); } qp->pd = pd; qp->send_cq = qp_init_attr->send_cq; qp->xrcd = NULL; atomic_inc(&pd->usecnt); if (qp_init_attr->send_cq) atomic_inc(&qp_init_attr->send_cq->usecnt); if (qp_init_attr->rwq_ind_tbl) atomic_inc(&qp->rwq_ind_tbl->usecnt); /* * Note: all hw drivers guarantee that max_send_sge is lower than * the device RDMA WRITE SGE limit but not all hw drivers ensure that * max_send_sge <= max_sge_rd. 
*/ qp->max_write_sge = qp_init_attr->cap.max_send_sge; qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, device->attrs.max_sge_rd); return qp; } EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; enum ib_qp_attr_mask req_param[IB_QPT_MAX]; enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .req_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_PORT, [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } }, }, [IB_QPS_INIT] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } }, [IB_QPS_RTR] = { .valid = 1, .req_param = { [IB_QPT_UC] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN), [IB_QPT_RC] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), [IB_QPT_XRC_INI] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN), [IB_QPT_XRC_TGT] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, .opt_param = { [IB_QPT_UD] = 
(IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), }, }, }, [IB_QPS_RTR] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .req_param = { [IB_QPT_UD] = IB_QP_SQ_PSN, [IB_QPT_UC] = IB_QP_SQ_PSN, [IB_QPT_RC] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | IB_QP_SQ_PSN), [IB_QPT_SMI] = IB_QP_SQ_PSN, [IB_QPT_GSI] = IB_QP_SQ_PSN, }, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, } } }, [IB_QPS_RTS] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | 
IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, } }, [IB_QPS_SQD] = { .valid = 1, .opt_param = { [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY } }, }, [IB_QPS_SQD] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), } }, [IB_QPS_SQD] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_AV | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX 
| IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } } }, [IB_QPS_SQE] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), } } }, [IB_QPS_ERR] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 } } }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; if (mask & IB_QP_CUR_STATE && cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) return false; if (!qp_state_table[cur_state][next_state].valid) return false; req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; if ((mask & req_param) != req_param) return false; if (mask & ~(req_param | opt_param | IB_QP_STATE)) return false; return true; } EXPORT_SYMBOL(ib_modify_qp_is_ok); int ib_resolve_eth_dmac(struct ib_device *device, struct ib_ah_attr *ah_attr) { struct ib_gid_attr sgid_attr; union ib_gid sgid; int hop_limit; int ret; if (ah_attr->port_num < rdma_start_port(device) || ah_attr->port_num > rdma_end_port(device)) return -EINVAL; if (!rdma_cap_eth_ah(device, ah_attr->port_num)) return 0; if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { __be32 addr = 0; memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); ip_eth_mc_map(addr, (char *)ah_attr->dmac); } else { 
ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, (char *)ah_attr->dmac); } return 0; } ret = ib_query_gid(device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid, &sgid_attr); if (ret != 0) return (ret); if (!sgid_attr.ndev) return -ENXIO; ret = rdma_addr_find_l2_eth_by_grh(&sgid, &ah_attr->grh.dgid, ah_attr->dmac, sgid_attr.ndev, &hop_limit); dev_put(sgid_attr.ndev); ah_attr->grh.hop_limit = hop_limit; return ret; } EXPORT_SYMBOL(ib_resolve_eth_dmac); int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { if (qp_attr_mask & IB_QP_AV) { int ret; ret = ib_resolve_eth_dmac(qp->device, &qp_attr->ah_attr); if (ret) return ret; } return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { return qp->device->query_qp ? qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : -ENOSYS; } EXPORT_SYMBOL(ib_query_qp); int ib_close_qp(struct ib_qp *qp) { struct ib_qp *real_qp; unsigned long flags; real_qp = qp->real_qp; if (real_qp == qp) return -EINVAL; spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); list_del(&qp->open_list); spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); atomic_dec(&real_qp->usecnt); kfree(qp); return 0; } EXPORT_SYMBOL(ib_close_qp); static int __ib_destroy_shared_qp(struct ib_qp *qp) { struct ib_xrcd *xrcd; struct ib_qp *real_qp; int ret; real_qp = qp->real_qp; xrcd = real_qp->xrcd; mutex_lock(&xrcd->tgt_qp_mutex); ib_close_qp(qp); if (atomic_read(&real_qp->usecnt) == 0) list_del(&real_qp->xrcd_list); else real_qp = NULL; mutex_unlock(&xrcd->tgt_qp_mutex); if (real_qp) { ret = ib_destroy_qp(real_qp); if (!ret) atomic_dec(&xrcd->usecnt); else __ib_insert_xrcd_qp(xrcd, real_qp); } return 0; } int ib_destroy_qp(struct ib_qp *qp) { struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; struct 
ib_rwq_ind_table *ind_tbl; int ret; if (atomic_read(&qp->usecnt)) return -EBUSY; if (qp->real_qp != qp) return __ib_destroy_shared_qp(qp); pd = qp->pd; scq = qp->send_cq; rcq = qp->recv_cq; srq = qp->srq; ind_tbl = qp->rwq_ind_tbl; ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) atomic_dec(&pd->usecnt); if (scq) atomic_dec(&scq->usecnt); if (rcq) atomic_dec(&rcq->usecnt); if (srq) atomic_dec(&srq->usecnt); if (ind_tbl) atomic_dec(&ind_tbl->usecnt); } return ret; } EXPORT_SYMBOL(ib_destroy_qp); /* Completion queues */ struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr) { struct ib_cq *cq; cq = device->create_cq(device, cq_attr, NULL, NULL); if (!IS_ERR(cq)) { cq->device = device; cq->uobject = NULL; cq->comp_handler = comp_handler; cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); } return cq; } EXPORT_SYMBOL(ib_create_cq); int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) { return cq->device->modify_cq ? cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS; } EXPORT_SYMBOL(ib_modify_cq); int ib_destroy_cq(struct ib_cq *cq) { if (atomic_read(&cq->usecnt)) return -EBUSY; return cq->device->destroy_cq(cq); } EXPORT_SYMBOL(ib_destroy_cq); int ib_resize_cq(struct ib_cq *cq, int cqe) { return cq->device->resize_cq ? cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS; } EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ int ib_dereg_mr(struct ib_mr *mr) { struct ib_pd *pd = mr->pd; int ret; ret = mr->device->dereg_mr(mr); if (!ret) atomic_dec(&pd->usecnt); return ret; } EXPORT_SYMBOL(ib_dereg_mr); /** * ib_alloc_mr() - Allocates a memory region * @pd: protection domain associated with the region * @mr_type: memory region type * @max_num_sg: maximum sg entries available for registration. * * Notes: * Memory registeration page/sg lists must not exceed max_num_sg. 
* For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
 * max_num_sg * used_page_size.
 *
 * Returns ERR_PTR(-ENOSYS) when the device has no alloc_mr hook, otherwise
 * the driver's result.  On success the MR is bound to @pd and the PD's use
 * count is incremented.
 */
struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
			  enum ib_mr_type mr_type,
			  u32 max_num_sg)
{
	struct ib_mr *mr;

	if (!pd->device->alloc_mr)
		return ERR_PTR(-ENOSYS);

	mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
	if (!IS_ERR(mr)) {
		mr->device = pd->device;
		mr->pd = pd;
		mr->uobject = NULL;
		atomic_inc(&pd->usecnt);
		mr->need_inval = false;
	}

	return mr;
}
EXPORT_SYMBOL(ib_alloc_mr);

/* "Fast" memory regions */

/*
 * ib_alloc_fmr - allocate an FMR through the driver's alloc_fmr hook.
 * Returns ERR_PTR(-ENOSYS) when the hook is missing.  On success the FMR is
 * bound to @pd and the PD's use count is incremented.
 */
struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
			    int mr_access_flags,
			    struct ib_fmr_attr *fmr_attr)
{
	struct ib_fmr *fmr;

	if (!pd->device->alloc_fmr)
		return ERR_PTR(-ENOSYS);

	fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
	if (!IS_ERR(fmr)) {
		fmr->device = pd->device;
		fmr->pd = pd;
		atomic_inc(&pd->usecnt);
	}

	return fmr;
}
EXPORT_SYMBOL(ib_alloc_fmr);

/*
 * ib_unmap_fmr - unmap a list of FMRs.
 * An empty list returns 0.  Otherwise the whole list is handed to the
 * unmap_fmr hook of the device that owns the first entry.
 */
int ib_unmap_fmr(struct list_head *fmr_list)
{
	struct ib_fmr *fmr;

	if (list_empty(fmr_list))
		return 0;

	/* Only the first entry is inspected, to find the owning device. */
	fmr = list_entry(fmr_list->next, struct ib_fmr, list);
	return fmr->device->unmap_fmr(fmr_list);
}
EXPORT_SYMBOL(ib_unmap_fmr);

/*
 * ib_dealloc_fmr - free an FMR through the driver.
 * The PD pointer is read before the driver call; the PD's use count is
 * decremented only when the driver reports success (0).
 */
int ib_dealloc_fmr(struct ib_fmr *fmr)
{
	struct ib_pd *pd;
	int ret;

	pd = fmr->pd;
	ret = fmr->device->dealloc_fmr(fmr);
	if (!ret)
		atomic_dec(&pd->usecnt);

	return ret;
}
EXPORT_SYMBOL(ib_dealloc_fmr);

/* Multicast groups */

/*
 * is_valid_mcast_lid - decide whether @lid is acceptable for a multicast
 * attach/detach on @qp.
 *
 * Returns true without looking at @lid when a non-InfiniBand link layer is
 * involved; otherwise applies the range check at the lid_check label.
 */
static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid)
{
	struct ib_qp_init_attr init_attr = {};
	struct ib_qp_attr attr = {};
	int num_eth_ports = 0;
	int port;

	/* If QP state >= init, it is assigned to a port and we can check this
	 * port only.
	 */
	if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) {
		if (attr.qp_state >= IB_QPS_INIT) {
			if (rdma_port_get_link_layer(qp->device, attr.port_num) !=
			    IB_LINK_LAYER_INFINIBAND)
				return true;
			goto lid_check;
		}
	}

	/*
	 * Can't get a quick answer, iterate over all ports.
	 * NOTE(review): the loop passes 0..phys_port_cnt-1 to
	 * rdma_port_get_link_layer(); confirm that this matches the port
	 * numbering base the helper expects.
	 */
	for (port = 0; port < qp->device->phys_port_cnt; port++)
		if (rdma_port_get_link_layer(qp->device, port) !=
		    IB_LINK_LAYER_INFINIBAND)
			num_eth_ports++;

	/* If we have at least one Ethernet port, RoCE annex declares that
	 * multicast LID should be ignored. We can't tell at this step if the
	 * QP belongs to an IB or Ethernet port.
	 */
	if (num_eth_ports)
		return true;

	/* If all the ports are IB, we can check according to IB spec. */
lid_check:
	return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
		 lid == be16_to_cpu(IB_LID_PERMISSIVE));
}

/*
 * ib_attach_mcast - attach a UD QP to a multicast group.
 * Returns -ENOSYS without an attach_mcast hook, and -EINVAL when @gid is
 * not a multicast address, the QP is not IB_QPT_UD, or @lid fails
 * is_valid_mcast_lid().  The QP's use count is incremented on success.
 */
int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
	int ret;

	if (!qp->device->attach_mcast)
		return -ENOSYS;

	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
		return -EINVAL;

	ret = qp->device->attach_mcast(qp, gid, lid);
	if (!ret)
		atomic_inc(&qp->usecnt);
	return ret;
}
EXPORT_SYMBOL(ib_attach_mcast);

/*
 * ib_detach_mcast - detach a UD QP from a multicast group.
 * Performs the same argument checks as ib_attach_mcast(); the QP's use
 * count is decremented when the driver reports success.
 */
int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
	int ret;

	if (!qp->device->detach_mcast)
		return -ENOSYS;

	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
		return -EINVAL;

	ret = qp->device->detach_mcast(qp, gid, lid);
	if (!ret)
		atomic_dec(&qp->usecnt);
	return ret;
}
EXPORT_SYMBOL(ib_detach_mcast);

/*
 * ib_alloc_xrcd - allocate an XRC domain through the driver.
 * Returns ERR_PTR(-ENOSYS) when the hook is missing.  On success the core
 * fields are initialised: device, a NULL inode, a zero use count, the
 * target-QP mutex and an empty target-QP list.
 */
struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
{
	struct ib_xrcd *xrcd;

	if (!device->alloc_xrcd)
		return ERR_PTR(-ENOSYS);

	xrcd = device->alloc_xrcd(device, NULL, NULL);
	if (!IS_ERR(xrcd)) {
		xrcd->device = device;
		xrcd->inode = NULL;
		atomic_set(&xrcd->usecnt, 0);
		mutex_init(&xrcd->tgt_qp_mutex);
		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
	}

	return xrcd;
}
EXPORT_SYMBOL(ib_alloc_xrcd);

/*
 * ib_dealloc_xrcd - free an XRC domain.
 * Returns -EBUSY while the use count is non-zero.  Otherwise destroys every
 * QP still on tgt_qp_list, stopping at the first failure, and then calls
 * the driver's dealloc_xrcd hook.
 */
int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
{
	struct ib_qp *qp;
	int ret;

	if (atomic_read(&xrcd->usecnt))
		return -EBUSY;

	/*
	 * Always takes the current list head.
	 * NOTE(review): presumably ib_destroy_qp() unlinks the QP from
	 * tgt_qp_list, otherwise this loop would not advance - not visible
	 * here, confirm.
	 */
	while (!list_empty(&xrcd->tgt_qp_list)) {
		qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
		ret = ib_destroy_qp(qp);
		if (ret)
			return ret;
	}

	return xrcd->device->dealloc_xrcd(xrcd);
}
EXPORT_SYMBOL(ib_dealloc_xrcd);

/**
 * ib_create_wq - Creates a WQ associated with the specified protection
 * domain.
 * @pd: The protection domain associated with the WQ.
 * @wq_init_attr: A list of initial attributes required to create the
 * WQ. If WQ creation succeeds, then the attributes are updated to
 * the actual capabilities of the created WQ.
 *
 * wq_init_attr->max_wr and wq_init_attr->max_sge determine
 * the requested size of the WQ, and set to the actual values allocated
 * on return.
 * If ib_create_wq() succeeds, then max_wr and max_sge will always be
 * at least as large as the requested values.
 */
struct ib_wq *ib_create_wq(struct ib_pd *pd,
			   struct ib_wq_init_attr *wq_attr)
{
	struct ib_wq *wq;

	if (!pd->device->create_wq)
		return ERR_PTR(-ENOSYS);

	wq = pd->device->create_wq(pd, wq_attr, NULL);
	if (!IS_ERR(wq)) {
		wq->event_handler = wq_attr->event_handler;
		wq->wq_context = wq_attr->wq_context;
		wq->wq_type = wq_attr->wq_type;
		wq->cq = wq_attr->cq;
		wq->device = pd->device;
		wq->pd = pd;
		wq->uobject = NULL;
		/* The new WQ holds a use count on both its PD and its CQ. */
		atomic_inc(&pd->usecnt);
		atomic_inc(&wq_attr->cq->usecnt);
		atomic_set(&wq->usecnt, 0);
	}
	return wq;
}
EXPORT_SYMBOL(ib_create_wq);

/**
 * ib_destroy_wq - Destroys the specified WQ.
 * @wq: The WQ to destroy.
 */
int ib_destroy_wq(struct ib_wq *wq)
{
	int err;
	/* Read before the driver call; only these copies are used afterwards. */
	struct ib_cq *cq = wq->cq;
	struct ib_pd *pd = wq->pd;

	if (atomic_read(&wq->usecnt))
		return -EBUSY;

	err = wq->device->destroy_wq(wq);
	if (!err) {
		atomic_dec(&pd->usecnt);
		atomic_dec(&cq->usecnt);
	}
	return err;
}
EXPORT_SYMBOL(ib_destroy_wq);

/**
 * ib_modify_wq - Modifies the specified WQ.
 * @wq: The WQ to modify.
 * @wq_attr: On input, specifies the WQ attributes to modify.
* @wq_attr_mask: A bit-mask used to specify which attributes of the WQ * are being modified. * On output, the current values of selected WQ attributes are returned. */ int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask) { int err; if (!wq->device->modify_wq) return -ENOSYS; err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL); return err; } EXPORT_SYMBOL(ib_modify_wq); /* * ib_create_rwq_ind_table - Creates a RQ Indirection Table. * @device: The device on which to create the rwq indirection table. * @ib_rwq_ind_table_init_attr: A list of initial attributes required to * create the Indirection Table. * * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less * than the created ib_rwq_ind_table object and the caller is responsible * for its memory allocation/free. */ struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr) { struct ib_rwq_ind_table *rwq_ind_table; int i; u32 table_size; if (!device->create_rwq_ind_table) return ERR_PTR(-ENOSYS); table_size = (1 << init_attr->log_ind_tbl_size); rwq_ind_table = device->create_rwq_ind_table(device, init_attr, NULL); if (IS_ERR(rwq_ind_table)) return rwq_ind_table; rwq_ind_table->ind_tbl = init_attr->ind_tbl; rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; rwq_ind_table->device = device; rwq_ind_table->uobject = NULL; atomic_set(&rwq_ind_table->usecnt, 0); for (i = 0; i < table_size; i++) atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); return rwq_ind_table; } EXPORT_SYMBOL(ib_create_rwq_ind_table); /* * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. * @wq_ind_table: The Indirection Table to destroy. 
*/ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) { int err, i; u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; if (atomic_read(&rwq_ind_table->usecnt)) return -EBUSY; err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table); if (!err) { for (i = 0; i < table_size; i++) atomic_dec(&ind_tbl[i]->usecnt); } return err; } EXPORT_SYMBOL(ib_destroy_rwq_ind_table); struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { struct ib_flow *flow_id; if (!qp->device->create_flow) return ERR_PTR(-ENOSYS); flow_id = qp->device->create_flow(qp, flow_attr, domain); if (!IS_ERR(flow_id)) atomic_inc(&qp->usecnt); return flow_id; } EXPORT_SYMBOL(ib_create_flow); int ib_destroy_flow(struct ib_flow *flow_id) { int err; struct ib_qp *qp = flow_id->qp; err = qp->device->destroy_flow(flow_id); if (!err) atomic_dec(&qp->usecnt); return err; } EXPORT_SYMBOL(ib_destroy_flow); int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) { return mr->device->check_mr_status ? 
mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; } EXPORT_SYMBOL(ib_check_mr_status); int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, int state) { if (!device->set_vf_link_state) return -ENOSYS; return device->set_vf_link_state(device, vf, port, state); } EXPORT_SYMBOL(ib_set_vf_link_state); int ib_get_vf_config(struct ib_device *device, int vf, u8 port, struct ifla_vf_info *info) { if (!device->get_vf_config) return -ENOSYS; return device->get_vf_config(device, vf, port, info); } EXPORT_SYMBOL(ib_get_vf_config); int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, struct ifla_vf_stats *stats) { if (!device->get_vf_stats) return -ENOSYS; return device->get_vf_stats(device, vf, port, stats); } EXPORT_SYMBOL(ib_get_vf_stats); int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, int type) { if (!device->set_vf_guid) return -ENOSYS; return device->set_vf_guid(device, vf, port, guid, type); } EXPORT_SYMBOL(ib_set_vf_guid); /** * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list * and set it the memory region. * @mr: memory region * @sg: dma mapped scatterlist * @sg_nents: number of entries in sg * @sg_offset: offset in bytes into sg * @page_size: page vector desired page size * * Constraints: * - The first sg element is allowed to have an offset. * - Each sg element must either be aligned to page_size or virtually * contiguous to the previous element. In case an sg element has a * non-contiguous offset, the mapping prefix will not include it. * - The last sg element is allowed to have length less than page_size. * - If sg_nents total byte length exceeds the mr max_num_sge * page_size * then only max_num_sg entries will be mapped. * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these * constraints holds and the page_size argument is ignored. * * Returns the number of sg elements that were mapped to the memory region. 
*
 * After this completes successfully, the  memory region
 * is ready for registration.
 */
int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
		 unsigned int *sg_offset, unsigned int page_size)
{
	if (unlikely(!mr->device->map_mr_sg))
		return -ENOSYS;

	/* Recorded on the MR before the driver hook runs. */
	mr->page_size = page_size;

	return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset);
}
EXPORT_SYMBOL(ib_map_mr_sg);

/**
 * ib_sg_to_pages() - Convert the largest prefix of a sg list
 *     to a page vector
 * @mr:            memory region
 * @sgl:           dma mapped scatterlist
 * @sg_nents:      number of entries in sg
 * @sg_offset_p:   IN:  start offset in bytes into sg
 *                 OUT: offset in bytes for element n of the sg of the first
 *                      byte that has not been processed where n is the return
 *                      value of this function.
 * @set_page:      driver page assignment function pointer
 *
 * Core service helper for drivers to convert the largest
 * prefix of given sg list to a page vector. The sg list
 * prefix converted is the prefix that meet the requirements
 * of ib_map_mr_sg.
 *
 * Returns the number of sg elements that were assigned to
 * a page vector.  Returns -EINVAL when sg_nents <= 0 or the start offset
 * exceeds the length of the first element.
 */
int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
		   unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
{
	struct scatterlist *sg;
	u64 last_end_dma_addr = 0;
	/* A NULL sg_offset_p means "start at offset 0". */
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	unsigned int last_page_off = 0;
	/* assumes mr->page_size is a power of two - TODO confirm */
	u64 page_mask = ~((u64)mr->page_size - 1);
	int i, ret;

	if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
		return -EINVAL;

	mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
	mr->length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		u64 dma_addr = sg_dma_address(sg) + sg_offset;
		u64 prev_addr = dma_addr;
		unsigned int dma_len = sg_dma_len(sg) - sg_offset;
		u64 end_dma_addr = dma_addr + dma_len;
		u64 page_addr = dma_addr & page_mask;

		/*
		 * For the second and later elements, check whether either the
		 * end of element i-1 or the start of element i is not aligned
		 * on a page boundary.
		 */
		if (i && (last_page_off != 0 || page_addr != dma_addr)) {
			/* Stop mapping if there is a gap. */
			if (last_end_dma_addr != dma_addr)
				break;

			/*
			 * Coalesce this element with the last. If it is small
			 * enough just update mr->length. Otherwise start
			 * mapping from the next page.
			 */
			goto next_page;
		}

		do {
			ret = set_page(mr, page_addr);
			if (unlikely(ret < 0)) {
				/*
				 * The driver refused this page: report how far
				 * into the current element we got and account
				 * only the bytes mapped so far.
				 */
				sg_offset = prev_addr - sg_dma_address(sg);
				mr->length += prev_addr - dma_addr;
				if (sg_offset_p)
					*sg_offset_p = sg_offset;
				/* The error is returned only if nothing at all was mapped. */
				return i || sg_offset ? i : ret;
			}
			prev_addr = page_addr;
next_page:
			page_addr += mr->page_size;
		} while (page_addr < end_dma_addr);

		mr->length += dma_len;
		last_end_dma_addr = end_dma_addr;
		last_page_off = end_dma_addr & ~page_mask;

		/* The start offset applies to the first element only. */
		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = 0;
	return i;
}
EXPORT_SYMBOL(ib_sg_to_pages);

/* Pairs a completion-queue entry with the completion a drain waits on. */
struct ib_drain_cqe {
	struct ib_cqe cqe;
	struct completion done;
};

/* Completion callback for drain WRs: signals the embedding ib_drain_cqe. */
static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
						cqe);

	complete(&cqe->done);
}

/*
 * Post a WR and block until its completion is reaped for the SQ.
 *
 * The QP is first moved to IB_QPS_ERR.  Every failure path only warns and
 * returns, so the caller gets no error indication.
 */
static void __ib_drain_sq(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_drain_cqe sdrain;
	/*
	 * NOTE(review): the next two lines are patch hunk lines carried by
	 * this text ('-' removed, '+' added), not C operators.
	 */
-	struct ib_send_wr *bad_swr;
+	const struct ib_send_wr *bad_swr;
	struct ib_rdma_wr swr = {
		.wr = {
			.opcode	= IB_WR_RDMA_WRITE,
			.wr_cqe	= &sdrain.cqe,
		},
	};
	int ret;

	if (qp->send_cq->poll_ctx == IB_POLL_DIRECT) {
		WARN_ONCE(qp->send_cq->poll_ctx == IB_POLL_DIRECT,
			  "IB_POLL_DIRECT poll_ctx not supported for drain\n");
		return;
	}

	sdrain.cqe.done = ib_drain_qp_done;
	init_completion(&sdrain.done);

	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
		return;
	}

	ret = ib_post_send(qp, &swr.wr, &bad_swr);
	if (ret) {
		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
		return;
	}

	wait_for_completion(&sdrain.done);
}

/*
 * Post a WR and block until its completion is reaped for the RQ.
*/
static void __ib_drain_rq(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_drain_cqe rdrain;
	/*
	 * NOTE(review): the next three lines are patch hunk lines carried by
	 * this text ('-' removed, '+' added), not C operators.
	 */
-	struct ib_recv_wr rwr = {}, *bad_rwr;
+	struct ib_recv_wr rwr = {};
+	const struct ib_recv_wr *bad_rwr;
	int ret;

	if (qp->recv_cq->poll_ctx == IB_POLL_DIRECT) {
		WARN_ONCE(qp->recv_cq->poll_ctx == IB_POLL_DIRECT,
			  "IB_POLL_DIRECT poll_ctx not supported for drain\n");
		return;
	}

	rwr.wr_cqe = &rdrain.cqe;
	rdrain.cqe.done = ib_drain_qp_done;
	init_completion(&rdrain.done);

	/* Move the QP to the error state before posting the drain WR. */
	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
		return;
	}

	ret = ib_post_recv(qp, &rwr, &bad_rwr);
	if (ret) {
		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
		return;
	}

	/* Blocks until ib_drain_qp_done() runs for rdrain.cqe. */
	wait_for_completion(&rdrain.done);
}

/**
 * ib_drain_sq() - Block until all SQ CQEs have been consumed by the
 *		   application.
 * @qp:            queue pair to drain
 *
 * If the device has a provider-specific drain function, then
 * call that.  Otherwise call the generic drain function
 * __ib_drain_sq().
 *
 * The caller must:
 *
 * ensure there is room in the CQ and SQ for the drain work request and
 * completion.
 *
 * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be
 * IB_POLL_DIRECT.
 *
 * ensure that there are no other contexts that are posting WRs concurrently.
 * Otherwise the drain is not guaranteed.
 */
void ib_drain_sq(struct ib_qp *qp)
{
	if (qp->device->drain_sq)
		qp->device->drain_sq(qp);
	else
		__ib_drain_sq(qp);
}
EXPORT_SYMBOL(ib_drain_sq);

/**
 * ib_drain_rq() - Block until all RQ CQEs have been consumed by the
 *		   application.
 * @qp:            queue pair to drain
 *
 * If the device has a provider-specific drain function, then
 * call that.  Otherwise call the generic drain function
 * __ib_drain_rq().
 *
 * The caller must:
 *
 * ensure there is room in the CQ and RQ for the drain work request and
 * completion.
 *
 * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be
 * IB_POLL_DIRECT.
*
 * ensure that there are no other contexts that are posting WRs concurrently.
 * Otherwise the drain is not guaranteed.
 */
void ib_drain_rq(struct ib_qp *qp)
{
	/* Prefer the provider's drain hook when the device supplies one. */
	if (qp->device->drain_rq)
		qp->device->drain_rq(qp);
	else
		__ib_drain_rq(qp);
}
EXPORT_SYMBOL(ib_drain_rq);

/**
 * ib_drain_qp() - Block until all CQEs have been consumed by the
 *		   application on both the RQ and SQ.
 * @qp:            queue pair to drain
 *
 * The receive queue is drained only when the QP has no SRQ attached.
 *
 * The caller must:
 *
 * ensure there is room in the CQ(s), SQ, and RQ for drain work requests
 * and completions.
 *
 * allocate the CQs using ib_alloc_cq() and the CQ poll context cannot be
 * IB_POLL_DIRECT.
 *
 * ensure that there are no other contexts that are posting WRs concurrently.
 * Otherwise the drain is not guaranteed.
 */
void ib_drain_qp(struct ib_qp *qp)
{
	ib_drain_sq(qp);
	if (!qp->srq)
		ib_drain_rq(qp);
}
EXPORT_SYMBOL(ib_drain_qp);
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index a5db9a256204..e40218f497b2 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -1,1036 +1,1036 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
 *
 * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #include "ipoib.h" #include #include #include #include #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; module_param(data_debug_level, int, 0644); MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif static DEFINE_MUTEX(pkey_mutex); struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv, struct ib_pd *pd, struct ib_ah_attr *attr) { struct ipoib_ah *ah; ah = kmalloc(sizeof *ah, GFP_KERNEL); if (!ah) return NULL; ah->priv = priv; ah->last_send = 0; kref_init(&ah->ref); ah->ah = ib_create_ah(pd, attr); if (IS_ERR(ah->ah)) { kfree(ah); ah = NULL; } else ipoib_dbg(priv, "Created ah %p\n", ah->ah); return ah; } void ipoib_free_ah(struct kref *kref) { struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); struct ipoib_dev_priv *priv = ah->priv; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_add_tail(&ah->list, &priv->dead_ahs); spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req) { struct mbuf *m; int i; for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++) ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len, DMA_FROM_DEVICE); } void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length) { m_adj(mb, -(mb->m_pkthdr.len - length)); } struct mbuf * ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int align, int size, int max_frags) { struct mbuf *mb, *m; int i, j; rx_req->mb = NULL; mb = m_getm2(NULL, align + size, M_NOWAIT, MT_DATA, M_PKTHDR); if (mb == NULL) return (NULL); for (i = 0, m = mb; m != NULL; m = m->m_next, i++) { MPASS(i < max_frags); m->m_len = M_SIZE(m) - align; m->m_data += align; align = 0; mb->m_pkthdr.len += m->m_len; rx_req->mapping[i] = ib_dma_map_single(priv->ca, mtod(m, void *), m->m_len, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(priv->ca, rx_req->mapping[i]))) goto error; } rx_req->mb 
= mb; return (mb); error: for (j = 0, m = mb; j < i; m = m->m_next, j++) ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len, DMA_FROM_DEVICE); m_freem(mb); return (NULL); } static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id) { struct ipoib_rx_buf *rx_req; - struct ib_recv_wr *bad_wr; + const struct ib_recv_wr *bad_wr; struct mbuf *m; int ret; int i; rx_req = &priv->rx_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { priv->rx_sge[i].addr = rx_req->mapping[i]; priv->rx_sge[i].length = m->m_len; } priv->rx_wr.num_sge = i; priv->rx_wr.wr_id = id | IPOIB_OP_RECV; ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]); m_freem(priv->rx_ring[id].mb); priv->rx_ring[id].mb = NULL; } return ret; } static struct mbuf * ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id) { return ipoib_alloc_map_mb(priv, &priv->rx_ring[id], 0, priv->max_ib_mtu + IB_GRH_BYTES, IPOIB_UD_RX_SG); } static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv) { int i; for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_alloc_rx_mb(priv, i)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); return -ENOMEM; } if (ipoib_ib_post_receive(priv, i)) { ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); return -EIO; } } return 0; } static void ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ipoib_rx_buf saverx; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; struct ifnet *dev = priv->dev; struct ipoib_header *eh; struct mbuf *mb; ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", wr_id, ipoib_recvq_size); return; } mb = priv->rx_ring[wr_id].mb; if (unlikely(wc->status != IB_WC_SUCCESS)) { if (wc->status != IB_WC_WR_FLUSH_ERR) { 
ipoib_warn(priv, "failed recv event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); goto repost; } if (mb) { ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]); m_freem(mb); priv->rx_ring[wr_id].mb = NULL; } return; } /* * Drop packets that this interface sent, ie multicast packets * that the HCA has replicated. */ if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) goto repost; memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx)); /* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer. */ if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) { memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx)); if_inc_counter(dev, IFCOUNTER_IQDROPS, 1); goto repost; } ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); ipoib_dma_unmap_rx(priv, &saverx); ipoib_dma_mb(priv, mb, wc->byte_len); if_inc_counter(dev, IFCOUNTER_IPACKETS, 1); if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len); mb->m_pkthdr.rcvif = dev; m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN); eh = mtod(mb, struct ipoib_header *); bzero(eh->hwaddr, 4); /* Zero the queue pair, only dgid is in grh */ if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID; dev->if_input(dev, mb); repost: if (unlikely(ipoib_ib_post_receive(priv, wr_id))) ipoib_warn(priv, "ipoib_ib_post_receive failed " "for buf %d\n", wr_id); } int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max) { struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m, *p; int error; int i; for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) { if (m->m_len != 0) continue; if (p == NULL) panic("ipoib_dma_map_tx: First mbuf empty\n"); p->m_next = m_free(m); m = p; i--; } i--; if (i >= max) { tx_req->mb = mb = m_defrag(mb, M_NOWAIT); if (mb == NULL) return -EIO; for (m = mb, i = 0; m != NULL; m = m->m_next, 
i++); if (i >= max) return -EIO; } error = 0; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { mapping[i] = ib_dma_map_single(ca, mtod(m, void *), m->m_len, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) { error = -EIO; break; } } if (error) { int end; end = i; for (m = mb, i = 0; i < end; m = m->m_next, i++) ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE); } return error; } void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) { struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m; int i; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE); } static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ifnet *dev = priv->dev; unsigned int wr_id = wc->wr_id; struct ipoib_tx_buf *tx_req; ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_sendq_size)) { ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", wr_id, ipoib_sendq_size); return; } tx_req = &priv->tx_ring[wr_id]; ipoib_dma_unmap_tx(priv->ca, tx_req); if_inc_counter(dev, IFCOUNTER_OPACKETS, 1); m_freem(tx_req->mb); ++priv->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && (dev->if_drv_flags & IFF_DRV_OACTIVE) && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) ipoib_warn(priv, "failed send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); } int ipoib_poll_tx(struct ipoib_dev_priv *priv, bool do_start) { int n, i; n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); for (i = 0; i < n; ++i) { struct ib_wc *wc = priv->send_wc + i; if (wc->wr_id & IPOIB_OP_CM) ipoib_cm_handle_tx_wc(priv, wc); else ipoib_ib_handle_tx_wc(priv, wc); } if (do_start && n != 0) ipoib_start_locked(priv->dev, priv); return n == 
MAX_SEND_CQE; } static void ipoib_poll(struct ipoib_dev_priv *priv) { int n, i; poll_more: spin_lock(&priv->drain_lock); for (;;) { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; i++) { struct ib_wc *wc = priv->ibwc + i; if ((wc->wr_id & IPOIB_OP_RECV) == 0) panic("ipoib_poll: Bad wr_id 0x%jX\n", (intmax_t)wc->wr_id); if (wc->wr_id & IPOIB_OP_CM) ipoib_cm_handle_rx_wc(priv, wc); else ipoib_ib_handle_rx_wc(priv, wc); } if (n != IPOIB_NUM_WC) break; } spin_unlock(&priv->drain_lock); if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) > 0) goto poll_more; } void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) { struct ipoib_dev_priv *priv = dev_ptr; ipoib_poll(priv); } static void drain_tx_cq(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; spin_lock(&priv->lock); while (ipoib_poll_tx(priv, true)) ; /* nothing */ if (dev->if_drv_flags & IFF_DRV_OACTIVE) mod_timer(&priv->poll_timer, jiffies + 1); spin_unlock(&priv->lock); } void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) { struct ipoib_dev_priv *priv = dev_ptr; mod_timer(&priv->poll_timer, jiffies); } static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head, int hlen) { - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m; int i; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { priv->tx_sge[i].addr = mapping[i]; priv->tx_sge[i].length = m->m_len; } priv->tx_wr.wr.num_sge = i; priv->tx_wr.wr.wr_id = wr_id; priv->tx_wr.remote_qpn = qpn; priv->tx_wr.ah = address; if (head) { priv->tx_wr.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */ priv->tx_wr.header = head; priv->tx_wr.hlen = hlen; priv->tx_wr.wr.opcode = IB_WR_LSO; } else priv->tx_wr.wr.opcode = IB_WR_SEND; return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr); } void ipoib_send(struct ipoib_dev_priv 
*priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn) { struct ifnet *dev = priv->dev; struct ipoib_tx_buf *tx_req; int hlen; void *phead; if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) while (ipoib_poll_tx(priv, false)) ; /* nothing */ m_adj(mb, sizeof (struct ipoib_pseudoheader)); if (0 /* XXX segment offload mb_is_gso(mb) */) { /* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */ phead = mtod(mb, void *); if (mb->m_len < hlen) { ipoib_warn(priv, "linear data too small\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); return; } m_adj(mb, hlen); } else { if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", mb->m_pkthdr.len, priv->mcast_mtu); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu); return; } phead = NULL; hlen = 0; } ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", mb->m_pkthdr.len, address, qpn); /* * We put the mb into the tx_ring _before_ we call post_send() * because it's entirely possible that the completion handler will * run before we execute anything after the post_send(). That * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). 
*/ tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; tx_req->mb = mb; if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); if (tx_req->mb) m_freem(tx_req->mb); return; } if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM; else priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) ipoib_warn(priv, "request notify on send CQ failed\n"); dev->if_drv_flags |= IFF_DRV_OACTIVE; } if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, tx_req, phead, hlen))) { ipoib_warn(priv, "post_send failed\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); --priv->tx_outstanding; ipoib_dma_unmap_tx(priv->ca, tx_req); m_freem(mb); if (dev->if_drv_flags & IFF_DRV_OACTIVE) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; } else { address->last_send = priv->tx_head; ++priv->tx_head; } } static void __ipoib_reap_ah(struct ipoib_dev_priv *priv) { struct ipoib_ah *ah, *tah; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) if ((int) priv->tx_tail - (int) ah->last_send >= 0) { list_del(&ah->list); ib_destroy_ah(ah->ah); kfree(ah); } spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_reap_ah(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, ah_reap_task.work); __ipoib_reap_ah(priv); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); } static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv) { unsigned long begin; begin = jiffies; while (!list_empty(&priv->dead_ahs)) { __ipoib_reap_ah(priv); if (time_after(jiffies, begin + HZ)) { ipoib_warn(priv, "timing out; will leak address handles\n"); 
break; } msleep(1); } } static void ipoib_ib_tx_timer_func(unsigned long ctx) { drain_tx_cq((struct ipoib_dev_priv *)ctx); } int ipoib_ib_dev_open(struct ipoib_dev_priv *priv) { int ret; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); return -1; } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = ipoib_init_qp(priv); if (ret) { ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); return -1; } ret = ipoib_ib_post_receives(priv); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); ipoib_ib_dev_stop(priv, 1); return -1; } ret = ipoib_cm_dev_open(priv); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); ipoib_ib_dev_stop(priv, 1); return -1; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); return 0; } static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv) { u16 pkey_index = 0; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); else set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } int ipoib_ib_dev_up(struct ipoib_dev_priv *priv) { ipoib_pkey_dev_check_presence(priv); if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { ipoib_dbg(priv, "PKEY is not assigned.\n"); return 0; } set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); return ipoib_mcast_start_thread(priv); } int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush) { ipoib_dbg(priv, "downing ib_dev\n"); clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); if_link_state_change(priv->dev, LINK_STATE_DOWN); /* Shutdown the P_Key thread if still active */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); cancel_delayed_work(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); if (flush) flush_workqueue(ipoib_workqueue); } 
ipoib_mcast_stop_thread(priv, flush); ipoib_mcast_dev_flush(priv); ipoib_flush_paths(priv); return 0; } /* Count the rx_ring slots (out of ipoib_recvq_size) that still hold an mbuf. */ static int recvs_pending(struct ipoib_dev_priv *priv) { int pending = 0; int i; for (i = 0; i < ipoib_recvq_size; ++i) if (priv->rx_ring[i].mb) ++pending; return pending; } /* Report a failed QP state change: query the QP's current state and log the attempted transition. A failed move to IB_QPS_ERR from IB_QPS_RESET is logged at debug level as acceptable; everything else is a warning. If the query itself fails, only that failure is logged. */ static void check_qp_movement_and_print(struct ipoib_dev_priv *priv, struct ib_qp *qp, enum ib_qp_state new_state) { struct ib_qp_attr qp_attr; struct ib_qp_init_attr query_init_attr; int ret; ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr); if (ret) { ipoib_warn(priv, "%s: Failed to query QP (%d)\n", __func__, ret); return; } /* print according to the new-state and the previous state */ if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) { ipoib_dbg(priv, "Failed to modify QP %d->%d, acceptable\n", qp_attr.qp_state, new_state); } else { ipoib_warn(priv, "Failed to modify QP %d->%d\n", qp_attr.qp_state, new_state); } } /* Poll priv->recv_cq in batches of IPOIB_NUM_WC under priv->drain_lock until a poll returns fewer than a full batch, then reap send completions with ipoib_poll_tx() under priv->lock. */ void ipoib_drain_cq(struct ipoib_dev_priv *priv) { int i, n; spin_lock(&priv->drain_lock); do { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { /* * Convert any successful completions to flush * errors to avoid passing packets up the * stack after bringing the device down. 
*/ if (priv->ibwc[i].status == IB_WC_SUCCESS) priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0) panic("ipoib_drain_cq: Bad wrid 0x%jX\n", (intmax_t)priv->ibwc[i].wr_id); if (priv->ibwc[i].wr_id & IPOIB_OP_CM) ipoib_cm_handle_rx_wc(priv, priv->ibwc + i); else ipoib_ib_handle_rx_wc(priv, priv->ibwc + i); } } while (n == IPOIB_NUM_WC); spin_unlock(&priv->drain_lock); spin_lock(&priv->lock); while (ipoib_poll_tx(priv, true)) ; /* nothing */ spin_unlock(&priv->lock); } int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush) { struct ib_qp_attr qp_attr; unsigned long begin; struct ipoib_tx_buf *tx_req; int i; clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); ipoib_cm_dev_stop(priv); /* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. */ qp_attr.qp_state = IB_QPS_ERR; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR); /* Wait for all sends and receives to complete */ begin = jiffies; while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", priv->tx_head - priv->tx_tail, recvs_pending(priv)); /* * assume the HW is wedged and just free up * all our pending work requests. 
*/ while ((int) priv->tx_tail - (int) priv->tx_head < 0) { tx_req = &priv->tx_ring[priv->tx_tail & (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv->ca, tx_req); m_freem(tx_req->mb); ++priv->tx_tail; --priv->tx_outstanding; } for (i = 0; i < ipoib_recvq_size; ++i) { struct ipoib_rx_buf *rx_req; rx_req = &priv->rx_ring[i]; if (!rx_req->mb) continue; ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]); m_freem(rx_req->mb); rx_req->mb = NULL; } goto timeout; } ipoib_drain_cq(priv); msleep(1); } ipoib_dbg(priv, "All sends and receives done.\n"); /* Reached both on a clean drain and after the forced cleanup above: stop the TX poll timer, reset the QP and shut down the address-handle reaper. */ timeout: del_timer_sync(&priv->poll_timer); qp_attr.qp_state = IB_QPS_RESET; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); /* Wait for all AHs to be reaped */ set_bit(IPOIB_STOP_REAPER, &priv->flags); cancel_delayed_work(&priv->ah_reap_task); if (flush) flush_workqueue(ipoib_workqueue); ipoib_ah_dev_cleanup(priv); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); return 0; } /* Record the device and port in priv, initialize the transport resources and the TX poll timer, and open the IB device immediately if the interface is already IFF_UP. Returns 0 on success or -ENODEV if transport initialization or the open fails (the transport resources are released again in the latter case). */ int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { struct ifnet *dev = priv->dev; priv->ca = ca; priv->port = port; priv->qp = NULL; if (ipoib_transport_dev_init(priv, ca)) { printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); return -ENODEV; } setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, (unsigned long) priv); if (dev->if_flags & IFF_UP) { if (ipoib_ib_dev_open(priv)) { ipoib_transport_dev_cleanup(priv); return -ENODEV; } } return 0; } /* Flush this interface at the requested level. Child interfaces are flushed first, recursively, while priv->vlan_mutex is held. */ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; u16 new_index; mutex_lock(&priv->vlan_mutex); /* * Flush any child interfaces too -- they might be up even if * the parent is down. 
*/ list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); mutex_unlock(&priv->vlan_mutex); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } /* A heavy flush first re-resolves the P_Key index; if the P_Key has disappeared the device is taken down and stopped, and delayed-open polling is started. */ if (level == IPOIB_FLUSH_HEAVY) { if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (ipoib_pkey_dev_delay_open(priv)) return; } /* NOTE(review): if ib_find_pkey() failed above and ipoib_pkey_dev_delay_open() then returned 0, new_index is read below without a successful lookup having written it; confirm it cannot be uninitialized here. */ /* restart QP only if P_Key index is changed */ if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && new_index == priv->pkey_index) { ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; } priv->pkey_index = new_index; } if (level == IPOIB_FLUSH_LIGHT) { ipoib_mark_paths_invalid(priv); ipoib_mcast_dev_flush(priv); } if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_down(priv, 0); if (level == IPOIB_FLUSH_HEAVY) { ipoib_ib_dev_stop(priv, 0); ipoib_ib_dev_open(priv); } /* * The device could have been brought down between the start and when * we get here, don't bring it back up if it's not configured up */ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_up(priv); ipoib_mcast_restart_task(&priv->restart_task); } } /* Work handler: recover the owning ipoib_dev_priv from its embedded flush_light work item and run a light flush. */ void ipoib_ib_dev_flush_light(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_light); __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); } /* Work handler: same as above for the flush_normal work item and a normal flush. */ void ipoib_ib_dev_flush_normal(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_normal); __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); } /* Work handler: same as above for the flush_heavy work item and a heavy flush. */ void ipoib_ib_dev_flush_heavy(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_heavy); __ipoib_ib_dev_flush(priv, 
IPOIB_FLUSH_HEAVY); } /* Final teardown of the IB side: stop the multicast thread (with flush), flush the multicast groups, reap the dead address handles and release the transport resources. */ void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv) { ipoib_dbg(priv, "cleaning up ib_dev\n"); ipoib_mcast_stop_thread(priv, 1); ipoib_mcast_dev_flush(priv); ipoib_ah_dev_cleanup(priv); ipoib_transport_dev_cleanup(priv); } /* * Delayed P_Key Assignment Interim Support * * The following is an initial implementation of the delayed P_Key assignment * mechanism. It is using the same approach implemented for the multicast * group join. The single goal of this implementation is to quickly address * Bug #2507. This implementation will probably be removed when the P_Key * change async notification is available. */ /* Delayed-work handler for priv->pkey_poll_task: re-check the P_Key and call ipoib_open() once it is assigned; otherwise re-arm the poll HZ ticks later unless IPOIB_PKEY_STOP is set. The stop check and requeue are done under pkey_mutex. */ void ipoib_pkey_poll(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); ipoib_pkey_dev_check_presence(priv); if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) ipoib_open(priv); else { mutex_lock(&pkey_mutex); if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); } } /* Returns 1 if the P_Key is not assigned yet and polling was started, or 0 if the P_Key is already assigned and the caller may proceed. */ int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv) { /* Look for the interface pkey value in the IB Port P_Key table and */ /* set the interface pkey assignment flag */ ipoib_pkey_dev_check_presence(priv); /* P_Key value not assigned yet - start polling */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); clear_bit(IPOIB_PKEY_STOP, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); return 1; } return 0; } diff --git a/sys/ofed/include/rdma/ib_verbs.h b/sys/ofed/include/rdma/ib_verbs.h index da17bc9f8250..45a25dc06c45 100644 --- a/sys/ofed/include/rdma/ib_verbs.h +++ b/sys/ofed/include/rdma/ib_verbs.h @@ -1,3398 +1,3398 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. 
All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $FreeBSD$ */ #if !defined(IB_VERBS_H) #define IB_VERBS_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct ifla_vf_info; struct ifla_vf_stats; struct ib_uverbs_file; extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; /* A 16-byte GID, accessible either as raw bytes or as a big-endian subnet prefix followed by a big-endian interface id. */ union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; extern union ib_gid zgid; enum ib_gid_type { /* If link layer is Ethernet, this is RoCE V1 */ IB_GID_TYPE_IB = 0, IB_GID_TYPE_ROCE = 0, IB_GID_TYPE_ROCE_UDP_ENCAP = 1, IB_GID_TYPE_SIZE }; #define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { enum ib_gid_type gid_type; struct ifnet *ndev; }; enum rdma_node_type { /* IB values map to NodeInfo:NodeType. */ RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, RDMA_NODE_RNIC, RDMA_NODE_USNIC, RDMA_NODE_USNIC_UDP, }; enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, RDMA_TRANSPORT_USNIC, RDMA_TRANSPORT_USNIC_UDP }; enum rdma_protocol_type { RDMA_PROTOCOL_IB, RDMA_PROTOCOL_IBOE, RDMA_PROTOCOL_IWARP, RDMA_PROTOCOL_USNIC_UDP }; __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type); enum rdma_network_type { RDMA_NETWORK_IB, RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB, RDMA_NETWORK_IPV4, RDMA_NETWORK_IPV6 }; /* Map an RDMA network type to a GID type: IPv4 and IPv6 give IB_GID_TYPE_ROCE_UDP_ENCAP, every other value gives IB_GID_TYPE_IB. */ static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) { if (network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) return IB_GID_TYPE_ROCE_UDP_ENCAP; /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */ return IB_GID_TYPE_IB; } /* Map a GID type to an RDMA network type: IB_GID_TYPE_IB gives RDMA_NETWORK_IB; for any other type the GID itself is inspected as an IPv6 address and a v4-mapped address selects the IPv4 network type. */ static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type, union ib_gid *gid) { if (gid_type == IB_GID_TYPE_IB) return RDMA_NETWORK_IB; if (ipv6_addr_v4mapped((struct in6_addr *)gid)) 
return RDMA_NETWORK_IPV4; else return RDMA_NETWORK_IPV6; } enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, }; /* Device capability bits, one bit per flag. */ enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = (1 << 0), IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), IB_DEVICE_RAW_MULTI = (1 << 3), IB_DEVICE_AUTO_PATH_MIG = (1 << 4), IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), IB_DEVICE_SHUTDOWN_PORT = (1 << 8), IB_DEVICE_INIT_TYPE = (1 << 9), IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), IB_DEVICE_SRQ_RESIZE = (1 << 13), IB_DEVICE_N_NOTIFY_CQ = (1 << 14), /* * This device supports a per-device lkey or stag that can be * used without performing a memory registration for the local * memory. Note that ULPs should never check this flag, but * should instead use the local_dma_lkey field in the ib_pd structure, * which will always contain a usable lkey. */ IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_CSUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ IB_DEVICE_UD_IP_CSUM = (1 << 18), IB_DEVICE_UD_TSO = (1 << 19), IB_DEVICE_XRC = (1 << 20), /* * This device supports the IB "base memory management extension", * which includes support for fast registrations (IB_WR_REG_MR, * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should * also be set by any iWarp device which must support FRs to comply * to the iWarp verbs spec. iWarp devices also support the * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the * stag. 
*/ IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), IB_DEVICE_RC_IP_CSUM = (1 << 25), IB_DEVICE_RAW_IP_CSUM = (1 << 26), /* * Devices should set IB_DEVICE_CROSS_CHANNEL if they * support execution of WQEs that involve synchronization * of I/O operations with single completion queue managed * by hardware. */ IB_DEVICE_CROSS_CHANNEL = (1 << 27), /* bit 28 is not assigned in this enum */ IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), /* bits 31 and up are written with 1ULL because they do not fit in a positive int */ IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), IB_DEVICE_SG_GAPS_REG = (1ULL << 32), IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), }; enum ib_signature_prot_cap { IB_PROT_T10DIF_TYPE_1 = 1, IB_PROT_T10DIF_TYPE_2 = 1 << 1, IB_PROT_T10DIF_TYPE_3 = 1 << 2, }; enum ib_signature_guard_cap { IB_GUARD_T10DIF_CRC = 1, IB_GUARD_T10DIF_CSUM = 1 << 1, }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; enum ib_odp_general_cap_bits { IB_ODP_SUPPORT = 1 << 0, }; enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_SEND = 1 << 0, IB_ODP_SUPPORT_RECV = 1 << 1, IB_ODP_SUPPORT_WRITE = 1 << 2, IB_ODP_SUPPORT_READ = 1 << 3, IB_ODP_SUPPORT_ATOMIC = 1 << 4, }; /* On-demand-paging capabilities: one general mask plus one mask per transport (RC, UC, UD). NOTE(review): the masks presumably hold the ib_odp_general_cap_bits and ib_odp_transport_cap_bits values defined above; confirm. */ struct ib_odp_caps { uint64_t general_caps; struct { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; } per_transport_caps; }; struct ib_rss_caps { /* Corresponding bit will be set if qp type from * 'enum ib_qp_type' is supported, e.g. 
* supported_qpts |= 1 << IB_QPT_UD */ u32 supported_qpts; u32 max_rwq_indirection_tables; u32 max_rwq_indirection_table_size; }; enum ib_cq_creation_flags { IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, }; struct ib_cq_init_attr { unsigned int cqe; u32 comp_vector; u32 flags; }; struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; u64 device_cap_flags; int max_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; struct ib_odp_caps odp_caps; uint64_t timestamp_mask; uint64_t hca_core_clock; /* in KHZ */ struct ib_rss_caps rss_caps; u32 max_wq_type_rq; }; /* MTU encodings; the enum value is an index (1..5), not the size itself. Use ib_mtu_enum_to_int() to get the number. */ enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; /* Convert an enum ib_mtu value to the integer its name carries (256, 512, 1024, 2048 or 4096). Returns -1 for any value that is not one of the five enumerators. */ static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5, IB_PORT_DUMMY = -1, /* force enum signed */ }; enum ib_port_cap_flags { IB_PORT_SM = 1 << 1, IB_PORT_NOTICE_SUP = 1 << 2, IB_PORT_TRAP_SUP = 1 << 3, IB_PORT_OPT_IPD_SUP = 1 << 4, IB_PORT_AUTO_MIGR_SUP = 1 << 5, IB_PORT_SL_MAP_SUP = 1 << 6, 
IB_PORT_MKEY_NVRAM = 1 << 7, IB_PORT_PKEY_NVRAM = 1 << 8, IB_PORT_LED_INFO_SUP = 1 << 9, IB_PORT_SM_DISABLED = 1 << 10, IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IB_PORT_CM_SUP = 1 << 16, IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, IB_PORT_REINIT_SUP = 1 << 18, IB_PORT_DEVICE_MGMT_SUP = 1 << 19, IB_PORT_VENDOR_CLASS_SUP = 1 << 20, IB_PORT_DR_NOTICE_SUP = 1 << 21, IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, IB_PORT_BOOT_MGMT_SUP = 1 << 23, IB_PORT_LINK_LATENCY_SUP = 1 << 24, IB_PORT_CLIENT_REG_SUP = 1 << 25, IB_PORT_IP_BASED_GIDS = 1 << 26, }; enum ib_port_phys_state { IB_PORT_PHYS_STATE_SLEEP = 1, IB_PORT_PHYS_STATE_POLLING = 2, IB_PORT_PHYS_STATE_DISABLED = 3, IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4, IB_PORT_PHYS_STATE_LINK_UP = 5, IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6, IB_PORT_PHYS_STATE_PHY_TEST = 7, }; /* Port width encodings. The values are single bits and are not ordered by width: IB_WIDTH_2X is 16, above IB_WIDTH_12X. Use ib_width_enum_to_int() to get the multiplier. */ enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_2X = 16, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; /* Convert an enum ib_port_width value to the integer its name carries (1, 2, 4, 8 or 12). Returns -1 for any value that is not one of the five enumerators. */ static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_2X: return 2; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32, IB_SPEED_HDR = 64 }; /** * struct rdma_hw_stats * @lock - Mutex to protect parallel write access to lifespan and values * of counters, which are 64 bits and not guaranteed to be written * atomically on 32-bit systems. * @timestamp - Used by the core code to track when the last update was * @lifespan - Used by the core code to determine how old the counters * should be before being updated again. Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default by specifying * their own value during their allocation routine. 
* @names - Array of pointers to static names used for the counters in * directory. * @num_counters - How many hardware counters there are. If names is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@names) < num_counters) * in their code to prevent this. * @value - Array of u64 counters that are accessed by the sysfs code and * filled in by the driver's get_stats routine */ struct rdma_hw_stats { struct mutex lock; /* Protect lifespan and value[] */ unsigned long timestamp; unsigned long lifespan; const char * const *names; int num_counters; u64 value[]; }; #define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 /** * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct * for drivers. * @names - Array of static const char * * @num_counters - How many elements in array * @lifespan - How many milliseconds between updates * * Allocates the structure together with num_counters trailing u64 slots * using kzalloc(). Returns the new structure with names, num_counters and * lifespan (converted to jiffies) filled in, or NULL if the allocation * fails. The names array is referenced, not copied. * NOTE(review): the embedded lock is not initialized here beyond the * zeroing done by kzalloc(); confirm that the caller initializes it. */ static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const char * const *names, int num_counters, unsigned long lifespan) { struct rdma_hw_stats *stats; stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64), GFP_KERNEL); if (!stats) return NULL; stats->names = names; stats->num_counters = num_counters; stats->lifespan = msecs_to_jiffies(lifespan); return stats; } /* Define bits for the various functionality this port needs to be supported by * the core. 
*/ /* Management 0x00000FFF */ #define RDMA_CORE_CAP_IB_MAD 0x00000001 #define RDMA_CORE_CAP_IB_SMI 0x00000002 #define RDMA_CORE_CAP_IB_CM 0x00000004 #define RDMA_CORE_CAP_IW_CM 0x00000008 #define RDMA_CORE_CAP_IB_SA 0x00000010 #define RDMA_CORE_CAP_OPA_MAD 0x00000020 /* Address format 0x000FF000 */ #define RDMA_CORE_CAP_AF_IB 0x00001000 #define RDMA_CORE_CAP_ETH_AH 0x00002000 /* Protocol 0xFFF00000 */ #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 #define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 /* Ready-made capability sets per port type: each is the OR of exactly one protocol bit with the management and address-format bits that port type uses. */ #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_SMI \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_IB_SA \ | RDMA_CORE_CAP_AF_IB) #define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ | RDMA_CORE_CAP_OPA_MAD) /* Port attributes. NOTE(review): state, max_mtu and active_mtu are typed with the enums; active_width, active_speed and phys_state are plain u8 and presumably carry ib_port_width, ib_port_speed and ib_port_phys_state values; confirm. */ struct ib_port_attr { u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; int gid_tbl_len; u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u16 lid; u16 sm_lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u8 active_speed; u8 phys_state; bool grh_required; }; enum ib_device_modify_flags { IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; #define IB_DEVICE_NODE_DESC_MAX 64 struct ib_device_modify { u64 sys_image_guid; char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; /* Note that bit 1 (value 2) is not used by this enum. */ enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3) }; 
struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); /* An asynchronous event: the device it came from, the event type, and the object it refers to. NOTE(review): which member of element is valid presumably depends on the event type; confirm. */ struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; struct ib_wq *wq; u8 port_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; /* Initialize an ib_event_handler in place: store the device and the callback and make its list node an empty list. _ptr is evaluated three times, so it must have no side effects. */ #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; union rdma_network_hdr { struct ib_grh ibgrh; struct { /* The IB spec states that if it's IPv4, the header * is located in the last 20 bytes of the header. 
*/ u8 reserved[20]; struct ip roce4grh; }; }; enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) #define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 }; /* Rate encodings. The numeric values are not ordered by speed (for example IB_RATE_5_GBPS is 5 while IB_RATE_10_GBPS is 3), so they must not be compared directly; ib_rate_to_mult() and ib_rate_to_mbps() below convert them to comparable numbers. */ enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18, IB_RATE_28_GBPS = 19, IB_RATE_50_GBPS = 20, IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, }; /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); /** * enum ib_mr_type - memory region type * @IB_MR_TYPE_MEM_REG: memory region that is used for * normal registration * @IB_MR_TYPE_SIGNATURE: memory region that is used for * signature operations (data-integrity * capable regions) * @IB_MR_TYPE_SG_GAPS: memory region that is capable of * registering arbitrary sg lists (without * the normal mr constraints - see * ib_map_mr_sg) */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SIGNATURE, IB_MR_TYPE_SG_GAPS, }; /** * Signature types * IB_SIG_TYPE_NONE: Unprotected. * IB_SIG_TYPE_T10_DIF: Type T10-DIF */ enum ib_signature_type { IB_SIG_TYPE_NONE, IB_SIG_TYPE_T10_DIF, }; /** * Signature T10-DIF block-guard types * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. * IB_T10DIF_CSUM: Corresponds to IP checksum rules. 
*/ enum ib_t10_dif_bg_type { IB_T10DIF_CRC, IB_T10DIF_CSUM }; /** * struct ib_t10_dif_domain - Parameters specific for T10-DIF * domain. * @bg_type: T10-DIF block guard type (CRC|CSUM) * @pi_interval: protection information interval. * @bg: seed of guard computation. * @app_tag: application tag of guard block * @ref_tag: initial guard block reference tag. * @ref_remap: Indicate whether the reftag increments each block * @app_escape: Indicate to skip block check if apptag=0xffff * @ref_escape: Indicate to skip block check if reftag=0xffffffff * @apptag_check_mask: check bitmask of application tag. */ struct ib_t10_dif_domain { enum ib_t10_dif_bg_type bg_type; u16 pi_interval; u16 bg; u16 app_tag; u32 ref_tag; bool ref_remap; bool app_escape; bool ref_escape; u16 apptag_check_mask; }; /** * struct ib_sig_domain - Parameters for signature domain * @sig_type: specific signature type * @sig: union of all signature domain attributes that may * be used to set domain layout. */ struct ib_sig_domain { enum ib_signature_type sig_type; union { struct ib_t10_dif_domain dif; } sig; }; /** * struct ib_sig_attrs - Parameters for signature handover operation * @check_mask: bitmask for signature byte check (8 bytes) * @mem: memory domain layout descriptor. * @wire: wire domain layout descriptor. */ struct ib_sig_attrs { u8 check_mask; struct ib_sig_domain mem; struct ib_sig_domain wire; }; enum ib_sig_err_type { IB_SIG_BAD_GUARD, IB_SIG_BAD_REFTAG, IB_SIG_BAD_APPTAG, }; /** * struct ib_sig_err - signature error descriptor */ struct ib_sig_err { enum ib_sig_err_type err_type; u32 expected; u32 actual; u64 sig_err_offset; u32 key; }; enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; /** * struct ib_mr_status - Memory region status container * * @fail_status: Bitmask of MR checks status. For each * failed check a corresponding status bit is set. * @sig_err: Additional info for IB_MR_CHECK_SIG_STATUS * failure. 
*/ struct ib_mr_status { u32 fail_status; struct ib_sig_err sig_err; }; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert. */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); /* Address handle attributes. NOTE(review): static_rate and ah_flags are plain u8 and presumably carry enum ib_rate and enum ib_ah_flags values; confirm. */ struct ib_ah_attr { struct ib_global_route grh; u16 dlid; u8 sl; u8 src_path_bits; u8 static_rate; u8 ah_flags; u8 port_num; u8 dmac[ETH_ALEN]; }; /* Work completion status codes. IB_WC_SUCCESS is 0; every other enumerator is named *_ERR. ib_wc_status_msg() below returns a string for a status. */ enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); enum ib_wc_opcode { IB_WC_SEND, IB_WC_RDMA_WRITE, IB_WC_RDMA_READ, IB_WC_COMP_SWAP, IB_WC_FETCH_ADD, IB_WC_LSO, IB_WC_LOCAL_INV, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). 
*/ IB_WC_RECV = 1 << 7, IB_WC_RECV_RDMA_WITH_IMM, IB_WC_DUMMY = -1, /* force enum signed */ }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; /* One work completion. The first member is an anonymous union, so a completion carries either a 64-bit wr_id or a wr_cqe pointer in the same storage. */ struct ib_wc { union { u64 wr_id; struct ib_cqe *wr_cqe; }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; int wc_flags; u16 pkey_index; u16 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; u8 network_hdr_type; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC, IB_SRQT_XRC }; enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; } xrc; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; /* * Maximum number of rdma_rw_ctx structures in flight at a time. * ib_create_qp() will calculate the right amount of needed WRs * and MRs based on this. */ u32 max_rdma_ctxs; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. 
*/ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC, IB_QPT_UC, IB_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, /* value 7 is skipped */ IB_QPT_RAW_PACKET = 8, IB_QPT_XRC_INI = 9, IB_QPT_XRC_TGT, IB_QPT_MAX, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer */ IB_QPT_RESERVED1 = 0x1000, IB_QPT_RESERVED2, IB_QPT_RESERVED3, IB_QPT_RESERVED4, IB_QPT_RESERVED5, IB_QPT_RESERVED6, IB_QPT_RESERVED7, IB_QPT_RESERVED8, IB_QPT_RESERVED9, IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, IB_QP_CREATE_SCATTER_FCS = 1 << 8, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, /* NOTE(review): 1 << 31 does not fit in a 32-bit signed int; confirm this is the intended way to spell bit 31 here. */ IB_QP_CREATE_RESERVED_END = 1 << 31, }; /* * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler * callback to destroy the passed in QP. */ struct ib_qp_init_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; /* * Only needed for special QP types, or when using the RW API. 
*/ u8 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), IB_QP_RESERVED1 = (1<<21), IB_QP_RESERVED2 = (1<<22), IB_QP_RESERVED3 = (1<<23), IB_QP_RESERVED4 = (1<<24), IB_QP_RATE_LIMIT = (1<<25), }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR, IB_QPS_DUMMY = -1, /* force enum signed */ }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; enum ib_mw_type { 
IB_MW_TYPE_1 = 1, IB_MW_TYPE_2 = 2 }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct ib_ah_attr ah_attr; struct ib_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u8 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; u32 rate_limit; }; enum ib_wr_opcode { IB_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND, IB_WR_SEND_WITH_IMM, IB_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_LSO, IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, IB_WR_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, IB_WR_REG_SIG_MR, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. 
*/ IB_WR_RESERVED1 = 0xf0, IB_WR_RESERVED2, IB_WR_RESERVED3, IB_WR_RESERVED4, IB_WR_RESERVED5, IB_WR_RESERVED6, IB_WR_RESERVED7, IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, IB_WR_DUMMY = -1, /* force enum signed */ }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; struct ib_cqe { void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; }; struct ib_rdma_wr { struct ib_send_wr wr; u64 remote_addr; u32 rkey; }; -static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr) +static inline const struct ib_rdma_wr *rdma_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_rdma_wr, wr); } struct ib_atomic_wr { struct ib_send_wr wr; u64 remote_addr; u64 compare_add; u64 swap; u64 compare_add_mask; u64 swap_mask; u32 rkey; }; -static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr) +static inline const struct ib_atomic_wr *atomic_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_atomic_wr, wr); } struct ib_ud_wr { struct ib_send_wr wr; struct ib_ah *ah; void *header; int hlen; int mss; u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ u8 port_num; /* valid for DR SMPs on switch only */ }; -static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr) +static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_ud_wr, wr); } struct ib_reg_wr { struct ib_send_wr wr; struct ib_mr *mr; u32 key; int access; }; -static inline struct ib_reg_wr *reg_wr(struct 
ib_send_wr *wr) +static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_reg_wr, wr); } struct ib_sig_handover_wr { struct ib_send_wr wr; struct ib_sig_attrs *sig_attrs; struct ib_mr *sig_mr; int access_flags; struct ib_sge *prot; }; -static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) +static inline const struct ib_sig_handover_wr *sig_handover_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_sig_handover_wr, wr); } struct ib_recv_wr { struct ib_recv_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = 1, IB_ACCESS_REMOTE_WRITE = (1<<1), IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), IB_ZERO_BASED = (1<<5), IB_ACCESS_ON_DEMAND = (1<<6), }; /* * XXX: these are apparently used for ->rereg_user_mr, no idea why they * are hidden here instead of a uapi header! */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), IB_MR_REREG_ACCESS = (1<<2), IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; struct ib_fmr_attr { int max_pages; int max_maps; u8 page_shift; }; struct ib_umem; struct ib_ucontext { struct ib_device *device; struct list_head pd_list; struct list_head mr_list; struct list_head mw_list; struct list_head cq_list; struct list_head qp_list; struct list_head srq_list; struct list_head ah_list; struct list_head xrcd_list; struct list_head rule_list; struct list_head wq_list; struct list_head rwq_ind_tbl_list; int closing; pid_t tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct rb_root umem_tree; /* * Protects .umem_rbroot and tree, as well as odp_mrs_count and * mmu notifiers registration. 
*/ struct rw_semaphore umem_rwsem; void (*invalidate_range)(struct ib_umem *umem, unsigned long start, unsigned long end); struct mmu_notifier mn; atomic_t notifier_count; /* A list of umems that don't have private mmu notifier counters yet. */ struct list_head no_private_counters; int odp_mrs_count; #endif }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; struct ib_udata { const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { u32 local_dma_lkey; u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ u32 unsafe_global_rkey; /* * Implementation details of the RDMA core, don't use in drivers: */ struct ib_mr *__internal_mr; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { IB_POLL_DIRECT, /* caller context, no hw completions */ IB_POLL_SOFTIRQ, /* poll from softirq context */ IB_POLL_WORKQUEUE, /* poll from workqueue */ }; struct ib_cq { struct ib_device *device; struct ib_uobject *uobject; ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; atomic_t usecnt; /* count number of work queues */ enum ib_poll_context poll_ctx; struct work_struct work; }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); 
void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; u32 srq_num; } xrc; } ext; }; enum ib_wq_type { IB_WQT_RQ }; enum ib_wq_state { IB_WQS_RESET, IB_WQS_RDY, IB_WQS_ERR }; struct ib_wq { struct ib_device *device; struct ib_uobject *uobject; void *wq_context; void (*event_handler)(struct ib_event *, void *); struct ib_pd *pd; struct ib_cq *cq; u32 wq_num; enum ib_wq_state state; enum ib_wq_type wq_type; atomic_t usecnt; }; struct ib_wq_init_attr { void *wq_context; enum ib_wq_type wq_type; u32 max_wr; u32 max_sge; struct ib_cq *cq; void (*event_handler)(struct ib_event *, void *); }; enum ib_wq_attr_mask { IB_WQ_STATE = 1 << 0, IB_WQ_CUR_STATE = 1 << 1, }; struct ib_wq_attr { enum ib_wq_state wq_state; enum ib_wq_state curr_wq_state; }; struct ib_rwq_ind_table { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; u32 ind_tbl_num; u32 log_ind_tbl_size; struct ib_wq **ind_tbl; }; struct ib_rwq_ind_table_init_attr { u32 log_ind_tbl_size; /* Each entry is a pointer to Receive Work Queue */ struct ib_wq **ind_tbl; }; /* * @max_write_sge: Maximum SGE elements per RDMA WRITE request. * @max_read_sge: Maximum SGE elements per RDMA READ request. 
*/ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; spinlock_t mr_lock; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; u32 max_write_sge; u32 max_read_sge; enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; u32 lkey; u32 rkey; u64 iova; u64 length; unsigned int page_size; bool need_inval; union { struct ib_uobject *uobject; /* user */ struct list_head qp_entry; /* FR */ }; }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; enum ib_mw_type type; }; struct ib_fmr { struct ib_device *device; struct ib_pd *pd; struct list_head list; u32 lkey; u32 rkey; }; /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ IB_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IB_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IB_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IB_FLOW_ATTR_SNIFFER = 0x3 }; /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41 }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 4 /* Flow steering rule priority is set according to it's domain. * Lower domain value means higher priority. 
*/ enum ib_flow_domain { IB_FLOW_DOMAIN_USER, IB_FLOW_DOMAIN_ETHTOOL, IB_FLOW_DOMAIN_RFS, IB_FLOW_DOMAIN_NIC, IB_FLOW_DOMAIN_NUM /* Must be last */ }; enum ib_flow_flags { IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ }; struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_eth { enum ib_flow_spec_type type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; }; struct ib_flow_ib_filter { __be16 dlid; __u8 sl; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_ib { enum ib_flow_spec_type type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; }; /* IPv4 header flags */ enum ib_ipv4_flags { IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the last have this flag set */ }; struct ib_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; u8 proto; u8 tos; u8 ttl; u8 flags; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_ipv4 { enum ib_flow_spec_type type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; }; struct ib_flow_ipv6_filter { u8 src_ip[16]; u8 dst_ip[16]; __be32 flow_label; u8 next_hdr; u8 traffic_class; u8 hop_limit; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_ipv6 { enum ib_flow_spec_type type; u16 size; struct ib_flow_ipv6_filter val; struct ib_flow_ipv6_filter mask; }; struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_tcp_udp { enum ib_flow_spec_type type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; }; union ib_flow_spec { struct { enum ib_flow_spec_type type; u16 size; }; struct ib_flow_spec_eth eth; struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; struct 
ib_flow_spec_ipv6 ipv6; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; u32 flags; u8 num_of_specs; u8 port; /* Following are the optional layers according to user request * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ }; struct ib_flow { struct ib_qp *qp; struct ib_uobject *uobject; }; struct ib_mad_hdr; struct ib_grh; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; #define IB_DEVICE_NAME_MAX 64 struct ib_cache { rwlock_t lock; struct ib_event_handler event_handler; struct ib_pkey_cache **pkey_cache; struct ib_gid_table **gid_cache; u8 *lmc_cache; }; struct ib_dma_mapping_ops { int (*mapping_error)(struct ib_device *dev, u64 dma_addr); u64 (*map_single)(struct ib_device *dev, void *ptr, size_t size, enum dma_data_direction direction); void (*unmap_single)(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction); u64 (*map_page)(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction); void (*unmap_page)(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction); int (*map_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); void (*unmap_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); int (*map_sg_attrs)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs); void (*unmap_sg_attrs)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs 
*attrs); void (*sync_single_for_cpu)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void *(*alloc_coherent)(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag); void (*free_coherent)(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle); }; struct iw_cm_verbs; struct ib_port_immutable { int pkey_tbl_len; int gid_tbl_len; u32 core_cap_flags; u32 max_mad_size; }; struct ib_device { struct device *dma_device; char name[IB_DEVICE_NAME_MAX]; struct list_head event_handler_list; spinlock_t event_handler_lock; spinlock_t client_data_lock; struct list_head core_list; /* Access to the client_data_list is protected by the client_data_lock * spinlock and the lists_rwsem read-write semaphore */ struct list_head client_data_list; struct ib_cache cache; /** * port_immutable is indexed by port number */ struct ib_port_immutable *port_immutable; int num_comp_vectors; struct iw_cm_verbs *iwcm; /** * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the * driver initialized data. The struct is kfree()'ed by the sysfs * core when the device is removed. A lifespan of -1 in the return * struct tells the core to set a default lifespan. */ struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device, u8 port_num); /** * get_hw_stats - Fill in the counter value(s) in the stats struct. 
* @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated * Return codes - * < 0 - Error, no counters updated * index - Updated the single counter pointed to by index * num_counters - Updated all counters (will reset the timestamp * and prevent further calls for lifespan milliseconds) * Drivers are allowed to update all counters in leiu of just the * one given in index at their option */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u8 port, int index); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr, struct ib_udata *udata); int (*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u8 port_num); /* When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee * that this function returns NULL before the net device reaches * NETDEV_UNREGISTER_FINAL state. */ struct ifnet *(*get_netdev)(struct ib_device *device, u8 port_num); int (*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); /* When calling add_gid, the HW vendor's driver should * add the gid of device @device at gid index @index of * port @port_num to be @gid. Meta-info of that gid (for example, * the network device related to this gid is available * at @attr. @context allows the HW vendor driver to store extra * information together with a GID entry. The HW vendor may allocate * memory to contain this information and store it in @context when a * new GID entry is written to. Params are consistent until the next * call of add_gid or delete_gid. The function should return 0 on * success or error otherwise. The function could be called * concurrently for different ports. 
This function is only called * when roce_gid_table is used. */ int (*add_gid)(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr, void **context); /* When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index @index of port @port_num. * Upon the deletion of a GID entry, the HW vendor must free any * allocated memory. The caller will clear @context afterwards. * This function is only called when roce_gid_table is used. */ int (*del_gid)(struct ib_device *device, u8 port_num, unsigned int index, void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int (*modify_port)(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify); struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device, struct ib_udata *udata); int (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); struct ib_pd * (*alloc_pd)(struct ib_device *device, struct ib_ucontext *context, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd); struct ib_ah * (*create_ah)(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah); struct ib_srq * (*create_srq)(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq); int (*post_srq_recv)(struct ib_srq *srq, - struct ib_recv_wr *recv_wr, - struct ib_recv_wr 
**bad_recv_wr); + const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); struct ib_qp * (*create_qp)(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp); int (*post_send)(struct ib_qp *qp, - struct ib_send_wr *send_wr, - struct ib_send_wr **bad_send_wr); + const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, - struct ib_recv_wr *recv_wr, - struct ib_recv_wr **bad_recv_wr); + const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); struct ib_cq * (*create_cq)(struct ib_device *device, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*req_ncomp_notif)(struct ib_cq *cq, int wc_cnt); struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); int (*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int (*dereg_mr)(struct ib_mr *mr); struct ib_mr * (*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, 
enum ib_mw_type type, struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int (*map_phys_fmr)(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova); int (*unmap_fmr)(struct list_head *fmr_list); int (*dealloc_fmr)(struct ib_fmr *fmr); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*process_mad)(struct ib_device *device, int process_mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in_mad, size_t in_mad_size, struct ib_mad_hdr *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, struct ib_ucontext *ucontext, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd); struct ib_flow * (*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int (*destroy_flow)(struct ib_flow *flow_id); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); void (*drain_rq)(struct ib_qp *qp); void (*drain_sq)(struct ib_qp *qp); int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port, int state); int (*get_vf_config)(struct ib_device *device, int vf, u8 port, struct ifla_vf_info *ivf); int (*get_vf_stats)(struct ib_device *device, int vf, u8 port, struct ifla_vf_stats *stats); int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, int type); struct ib_wq * (*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); int (*destroy_wq)(struct ib_wq *wq); int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask, struct ib_udata *udata); struct ib_rwq_ind_table * (*create_rwq_ind_table)(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr, struct 
ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dma_mapping_ops *dma_ops; struct module *owner; struct device dev; struct kobject *ports_parent; struct list_head port_list; enum { IB_DEV_UNINITIALIZED, IB_DEV_REGISTERED, IB_DEV_UNREGISTERED } reg_state; int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; u8 node_type; u8 phys_port_cnt; struct ib_device_attr attrs; struct attribute_group *hw_stats_ag; struct rdma_hw_stats *hw_stats; /** * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device * in fast paths. */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); }; struct ib_client { char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *, void *client_data); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. * @dev: An RDMA device that the net_dev use for communication. * @port: A physical port number on the RDMA device. * @pkey: P_Key that the net_dev uses if applicable. * @gid: A GID that the net_dev uses to communicate. * @addr: An IP address the net_dev is configured with. * @client_data: The device's client data set by ib_set_client_data(). * * An ib_client that implements a net_dev on top of RDMA devices * (such as IP over IB) should implement this callback, allowing the * rdma_cm module to find the right net_dev for a given request. * * The caller is responsible for calling dev_put on the returned * netdev. 
*/ struct ifnet *(*get_net_dev_by_params)( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); struct list_head list; }; struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } static inline bool ib_is_udata_cleared(struct ib_udata *udata, size_t offset, size_t len) { const void __user *p = (const char __user *)udata->inbuf + offset; bool ret; u8 *buf; if (len > USHRT_MAX) return false; buf = memdup_user(p, len); if (IS_ERR(buf)) return false; ret = !memchr_inv(buf, 0, len); kfree(buf); return ret; } /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. * @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. 
It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask); int ib_register_event_handler (struct ib_event_handler *event_handler); int ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num); /** * rdma_cap_ib_switch - Check if the device is IB switch * @device: Device to check * * Device driver is responsible for setting is_switch bit on * in ib_device structure at init time. * * Return: true if the device is IB switch. */ static inline bool rdma_cap_ib_switch(const struct ib_device *device) { return device->is_switch; } /** * rdma_start_port - Return the first valid port number for the device * specified * * @device: Device to be checked * * Return start port number */ static inline u8 rdma_start_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : 1; } /** * rdma_end_port - Return the last valid port number for the device * specified * * @device: Device to be checked * * Return last port number */ static inline u8 rdma_end_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 
0 : device->phys_port_cnt; } static inline int rdma_is_port_valid(const struct ib_device *device, unsigned int port) { return (port >= rdma_start_port(device) && port <= rdma_end_port(device)); } static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) { return rdma_protocol_ib(device, port_num) || rdma_protocol_roce(device, port_num); } /** * rdma_cap_ib_mad - Check if the port of a device supports Infiniband * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Management Datagrams (MAD) are a required part of the InfiniBand * specification and are supported on all InfiniBand devices. A slightly * extended version are also supported on OPA interfaces. * * Return: true if the port supports sending/receiving of MAD packets. */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; } /** * rdma_cap_opa_mad - Check if the port of device provides support for OPA * Management Datagrams. 
* @device: Device to check
 * @port_num: Port number to check
 *
 * Intel OmniPath devices extend and/or replace the InfiniBand Management
 * datagrams with their own versions.  These OPA MADs share many but not all of
 * the characteristics of InfiniBand MADs.
 *
 * OPA MADs differ in the following ways:
 *
 *    1) MADs are variable size up to 2K
 *       IBTA defined MADs remain fixed at 256 bytes
 *    2) OPA SMPs must carry valid PKeys
 *    3) OPA SMP packets are a different format
 *
 * The device is only read, so it is taken by const pointer like the other
 * rdma_cap_*() predicates.  The masked flags are compared against
 * RDMA_CORE_CAP_OPA_MAD rather than tested for non-zero, so every bit of
 * that mask must be set for the port to qualify.
 *
 * Return: true if the port supports OPA MAD packet formats.
 */
static inline bool rdma_cap_opa_mad(const struct ib_device *device, u8 port_num)
{
	return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD)
		== RDMA_CORE_CAP_OPA_MAD;
}

/**
 * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband
 * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI).
 * @device: Device to check
 * @port_num: Port number to check
 *
 * Each InfiniBand node is required to provide a Subnet Management Agent
 * that the subnet manager can access.  Prior to the fabric being fully
 * configured by the subnet manager, the SMA is accessed via a well known
 * interface called the Subnet Management Interface (SMI).  This interface
 * uses directed route packets to communicate with the SM to get around the
 * chicken and egg problem of the SM needing to know what's on the fabric
 * in order to configure the fabric, and needing to configure the fabric in
 * order to send packets to the devices on the fabric.  These directed
 * route packets do not need the fabric fully configured in order to reach
 * their destination.  The SMI is the only method allowed to send
 * directed route packets on an InfiniBand fabric.
 *
 * Return: true if the port provides an SMI.
*/
static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num)
{
	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI;
}

/**
 * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband
 * Communication Manager.
 * @device: Device to check
 * @port_num: Port number to check
 *
 * The InfiniBand Communication Manager is one of many pre-defined General
 * Service Agents (GSA) that are accessed via the General Service
 * Interface (GSI).  Its role is to facilitate establishment of connections
 * between nodes as well as other management related tasks for established
 * connections.
 *
 * Return: true if the port supports an IB CM (this does not guarantee that
 * a CM is actually running however).
 */
static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num)
{
	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM;
}

/**
 * rdma_cap_iw_cm - Check if the port of device has the capability IWARP
 * Communication Manager.
 * @device: Device to check
 * @port_num: Port number to check
 *
 * Similar to above, but specific to iWARP connections which have a different
 * management protocol than InfiniBand.
 *
 * Return: true if the port supports an iWARP CM (this does not guarantee that
 * a CM is actually running however).
 */
static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num)
{
	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM;
}

/**
 * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband
 * Subnet Administration.
 * @device: Device to check
 * @port_num: Port number to check
 *
 * An InfiniBand Subnet Administration (SA) service is a pre-defined General
 * Service Agent (GSA) provided by the Subnet Manager (SM).  On InfiniBand
 * fabrics, devices should resolve routes to other hosts by contacting the
 * SA to query the proper route.
*
 * Return: true if the port should act as a client to the fabric Subnet
 * Administration interface.  This does not imply that the SA service is
 * running locally.
 */
static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num)
{
	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA;
}

/**
 * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband
 * Multicast.
 * @device: Device to check
 * @port_num: Port number to check
 *
 * InfiniBand multicast registration is more complex than normal IPv4 or
 * IPv6 multicast registration.  Each Host Channel Adapter must register
 * with the Subnet Manager when it wishes to join a multicast group.  It
 * should do so only once regardless of how many queue pairs it subscribes
 * to this group.  And it should leave the group only after all queue pairs
 * attached to the group have been detached.
 *
 * Return: true if the port must undertake the additional administrative
 * overhead of registering/unregistering with the SM and tracking of the
 * total number of queue pairs attached to the multicast group.
 */
static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num)
{
	/* This capability is reported as exactly the SA capability. */
	return rdma_cap_ib_sa(device, port_num);
}

/**
 * rdma_cap_af_ib - Check if the port of device has the capability
 * Native Infiniband Address.
 * @device: Device to check
 * @port_num: Port number to check
 *
 * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default
 * GID.  RoCE uses a different mechanism, but still generates a GID via
 * a prescribed mechanism and port specific data.
 *
 * Return: true if the port uses a GID address to identify devices on the
 * network.
 */
static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num)
{
	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB;
}

/**
 * rdma_cap_eth_ah - Check if the port of device has the capability
 * Ethernet Address Handle.
* @device: Device to check
 * @port_num: Port number to check
 *
 * RoCE is InfiniBand over Ethernet, and it uses a well defined technique
 * to fabricate GIDs over Ethernet/IP specific addresses native to the
 * port.  Normally, packet headers are generated by the sending host
 * adapter, but when sending connectionless datagrams, we must manually
 * inject the proper headers for the fabric we are communicating over.
 *
 * Return: true if we are running as a RoCE port and must force the
 * addition of a Global Route Header built from our Ethernet Address
 * Handle into our header list for connectionless packets.
 */
static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
{
	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH;
}

/**
 * rdma_max_mad_size - Return the max MAD size required by this RDMA Port.
 *
 * @device: Device
 * @port_num: Port number
 *
 * This MAD size includes the MAD headers and MAD payload.  No other headers
 * are included.
 *
 * Return: the max MAD size required by the port, as stored in
 * device->port_immutable[port_num].max_mad_size.  Will return 0 if the port
 * does not support MADs.
 */
static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num)
{
	return device->port_immutable[port_num].max_mad_size;
}

/**
 * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table
 * @device: Device to check
 * @port_num: Port number to check
 *
 * RoCE GID table mechanism manages the various GIDs for a device.
 *
 * NOTE: if allocating the port's GID table has failed, this call will still
 * return true, but any RoCE GID table API will fail.
 *
 * Return: true if the port uses RoCE GID table mechanism in order to manage
 * its GIDs.
 */
static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
					   u8 port_num)
{
	/* Requires a RoCE port and both add_gid and del_gid to be non-NULL. */
	return rdma_protocol_roce(device, port_num) &&
		device->add_gid && device->del_gid;
}

/*
 * Check if the device supports READ W/ INVALIDATE.
*/
static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num)
{
	/*
	 * iWarp drivers must support READ W/ INVALIDATE.  No other protocol
	 * has support for it yet.
	 *
	 * NOTE(review): port_num is u32 here but rdma_protocol_iwarp() takes
	 * a u8, so the value is narrowed at the call below.
	 */
	return rdma_protocol_iwarp(dev, port_num);
}

int ib_query_gid(struct ib_device *device,
		 u8 port_num, int index, union ib_gid *gid,
		 struct ib_gid_attr *attr);

/* Per-VF (virtual function) configuration and statistics. */
int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
			 int state);
int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
		     struct ifla_vf_info *info);
int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
		    struct ifla_vf_stats *stats);
int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
		   int type);

int ib_query_pkey(struct ib_device *device,
		  u8 port_num, u16 index, u16 *pkey);

int ib_modify_device(struct ib_device *device,
		     int device_modify_mask,
		     struct ib_device_modify *device_modify);

int ib_modify_port(struct ib_device *device,
		   u8 port_num, int port_modify_mask,
		   struct ib_port_modify *port_modify);

int ib_find_gid(struct ib_device *device, union ib_gid *gid,
		enum ib_gid_type gid_type, struct ifnet *ndev,
		u8 *port_num, u16 *index);

int ib_find_pkey(struct ib_device *device,
		 u8 port_num, u16 pkey, u16 *index);

enum ib_pd_flags {
	/*
	 * Create a memory registration for all memory in the system and place
	 * the rkey for it into pd->unsafe_global_rkey.  This can be used by
	 * ULPs to avoid the overhead of dynamic MRs.
	 *
	 * This flag is generally considered unsafe and must only be used in
	 * extremely trusted environments.  Every use of it will log a warning
	 * in the kernel log.
	 */
	IB_PD_UNSAFE_GLOBAL_RKEY	= 0x01,
};

struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
		const char *caller);
/* Wrapper that passes the calling function's name (__func__) as @caller. */
#define ib_alloc_pd(device, flags) \
	__ib_alloc_pd((device), (flags), __func__)
void ib_dealloc_pd(struct ib_pd *pd);

/**
 * ib_create_ah - Creates an address handle for the given address vector.
 * @pd: The protection domain associated with the address handle.
* @ah_attr: The attributes of the address vector.
 *
 * The address handle is used to reference a local or global destination
 * in all UD QP post sends.
 *
 * NOTE(review): how failure is reported through the returned pointer is
 * not visible in this header; confirm against the implementation.
 */
struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);

/**
 * ib_init_ah_from_wc - Initializes address handle attributes from a
 *   work completion.
 * @device: Device on which the received message arrived.
 * @port_num: Port on which the received message arrived.
 * @wc: Work completion associated with the received message.
 * @grh: References the received global route header.  This parameter is
 *   ignored unless the work completion indicates that the GRH is valid.
 * @ah_attr: Returned attributes that can be used when creating an address
 *   handle for replying to the message.
 */
int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
		       const struct ib_wc *wc, const struct ib_grh *grh,
		       struct ib_ah_attr *ah_attr);

/**
 * ib_create_ah_from_wc - Creates an address handle associated with the
 *   sender of the specified work completion.
 * @pd: The protection domain associated with the address handle.
 * @wc: Work completion information associated with a received message.
 * @grh: References the received global route header.  This parameter is
 *   ignored unless the work completion indicates that the GRH is valid.
 * @port_num: The outbound port number to associate with the address.
 *
 * The address handle is used to reference a local or global destination
 * in all UD QP post sends.
 */
struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
				   const struct ib_grh *grh, u8 port_num);

/**
 * ib_modify_ah - Modifies the address vector associated with an address
 *   handle.
 * @ah: The address handle to modify.
 * @ah_attr: The new address vector attributes to associate with the
 *   address handle.
 */
int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

/**
 * ib_query_ah - Queries the address vector associated with an address
 *   handle.
 * @ah: The address handle to query.
* @ah_attr: The address vector attributes associated with the address
 *   handle.
 */
int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

/**
 * ib_destroy_ah - Destroys an address handle.
 * @ah: The address handle to destroy.
 */
int ib_destroy_ah(struct ib_ah *ah);

/**
 * ib_create_srq - Creates a SRQ associated with the specified protection
 *   domain.
 * @pd: The protection domain associated with the SRQ.
 * @srq_init_attr: A list of initial attributes required to create the
 *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
 *   the actual capabilities of the created SRQ.
 *
 * srq_attr->max_wr and srq_attr->max_sge are read to determine the
 * requested size of the SRQ, and set to the actual values allocated
 * on return.  If ib_create_srq() succeeds, then max_wr and max_sge
 * will always be at least as large as the requested values.
 */
struct ib_srq *ib_create_srq(struct ib_pd *pd,
			     struct ib_srq_init_attr *srq_init_attr);

/**
 * ib_modify_srq - Modifies the attributes for the specified SRQ.
 * @srq: The SRQ to modify.
 * @srq_attr: On input, specifies the SRQ attributes to modify.  On output,
 *   the current values of selected SRQ attributes are returned.
 * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
 *   are being modified.
 *
 * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
 * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
 * the number of receives queued drops below the limit.
 */
int ib_modify_srq(struct ib_srq *srq,
		  struct ib_srq_attr *srq_attr,
		  enum ib_srq_attr_mask srq_attr_mask);

/**
 * ib_query_srq - Returns the attribute list and current values for the
 *   specified SRQ.
 * @srq: The SRQ to query.
 * @srq_attr: The attributes of the specified SRQ.
 */
int ib_query_srq(struct ib_srq *srq,
		 struct ib_srq_attr *srq_attr);

/**
 * ib_destroy_srq - Destroys the specified SRQ.
 * @srq: The SRQ to destroy.
*/
int ib_destroy_srq(struct ib_srq *srq);

/**
 * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
 * @srq: The SRQ to post the work request on.
 * @recv_wr: A list of work requests to post on the receive queue.
 * @bad_recv_wr: On an immediate failure, this parameter will reference
 *   the work request that failed to be posted on the QP.
 *
 * Forwards to the device's post_srq_recv method and returns its result.
 */
static inline int ib_post_srq_recv(struct ib_srq *srq,
-				   struct ib_recv_wr *recv_wr,
-				   struct ib_recv_wr **bad_recv_wr)
+				   const struct ib_recv_wr *recv_wr,
+				   const struct ib_recv_wr **bad_recv_wr)
{
	return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr);
}

/**
 * ib_create_qp - Creates a QP associated with the specified protection
 *   domain.
 * @pd: The protection domain associated with the QP.
 * @qp_init_attr: A list of initial attributes required to create the
 *   QP.  If QP creation succeeds, then the attributes are updated to
 *   the actual capabilities of the created QP.
 */
struct ib_qp *ib_create_qp(struct ib_pd *pd,
			   struct ib_qp_init_attr *qp_init_attr);

/**
 * ib_modify_qp - Modifies the attributes for the specified QP and then
 *   transitions the QP to the given state.
 * @qp: The QP to modify.
 * @qp_attr: On input, specifies the QP attributes to modify.  On output,
 *   the current values of selected QP attributes are returned.
 * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
 *   are being modified.
 */
int ib_modify_qp(struct ib_qp *qp,
		 struct ib_qp_attr *qp_attr,
		 int qp_attr_mask);

/**
 * ib_query_qp - Returns the attribute list and current values for the
 *   specified QP.
 * @qp: The QP to query.
 * @qp_attr: The attributes of the specified QP.
 * @qp_attr_mask: A bit-mask used to select specific attributes to query.
 * @qp_init_attr: Additional attributes of the selected QP.
 *
 * The qp_attr_mask may be used to limit the query to gathering only the
 * selected attributes.
*/
int ib_query_qp(struct ib_qp *qp,
		struct ib_qp_attr *qp_attr,
		int qp_attr_mask,
		struct ib_qp_init_attr *qp_init_attr);

/**
 * ib_destroy_qp - Destroys the specified QP.
 * @qp: The QP to destroy.
 */
int ib_destroy_qp(struct ib_qp *qp);

/**
 * ib_open_qp - Obtain a reference to an existing sharable QP.
 * @xrcd: XRC domain
 * @qp_open_attr: Attributes identifying the QP to open.
 *
 * Returns a reference to a sharable QP.
 */
struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
			 struct ib_qp_open_attr *qp_open_attr);

/**
 * ib_close_qp - Release an external reference to a QP.
 * @qp: The QP handle to release
 *
 * The opened QP handle is released by the caller.  The underlying
 * shared QP is not destroyed until all internal references are released.
 */
int ib_close_qp(struct ib_qp *qp);

/**
 * ib_post_send - Posts a list of work requests to the send queue of
 *   the specified QP.
 * @qp: The QP to post the work request on.
 * @send_wr: A list of work requests to post on the send queue.
 * @bad_send_wr: On an immediate failure, this parameter will reference
 *   the work request that failed to be posted on the QP.
 *
 * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate
 * error is returned, the QP state shall not be affected,
 * ib_post_send() will return an immediate error after queueing any
 * earlier work requests in the list.
 *
 * Forwards to the device's post_send method and returns its result.
 */
static inline int ib_post_send(struct ib_qp *qp,
-			       struct ib_send_wr *send_wr,
-			       struct ib_send_wr **bad_send_wr)
+			       const struct ib_send_wr *send_wr,
+			       const struct ib_send_wr **bad_send_wr)
{
	return qp->device->post_send(qp, send_wr, bad_send_wr);
}

/**
 * ib_post_recv - Posts a list of work requests to the receive queue of
 *   the specified QP.
 * @qp: The QP to post the work request on.
 * @recv_wr: A list of work requests to post on the receive queue.
 * @bad_recv_wr: On an immediate failure, this parameter will reference
 *   the work request that failed to be posted on the QP.
*/
static inline int ib_post_recv(struct ib_qp *qp,
-			       struct ib_recv_wr *recv_wr,
-			       struct ib_recv_wr **bad_recv_wr)
+			       const struct ib_recv_wr *recv_wr,
+			       const struct ib_recv_wr **bad_recv_wr)
{
	/* Forwards to the device's post_recv method. */
	return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
}

struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
		int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx);
void ib_free_cq(struct ib_cq *cq);

/**
 * ib_create_cq - Creates a CQ on the specified device.
 * @device: The device on which to create the CQ.
 * @comp_handler: A user-specified callback that is invoked when a
 *   completion event occurs on the CQ.
 * @event_handler: A user-specified callback that is invoked when an
 *   asynchronous event not associated with a completion occurs on the CQ.
 * @cq_context: Context associated with the CQ returned to the user via
 *   the associated completion and event handlers.
 * @cq_attr: The attributes the CQ should be created upon.
 *
 * Users can examine the cq structure to determine the actual CQ size.
 */
struct ib_cq *ib_create_cq(struct ib_device *device,
			   ib_comp_handler comp_handler,
			   void (*event_handler)(struct ib_event *, void *),
			   void *cq_context,
			   const struct ib_cq_init_attr *cq_attr);

/**
 * ib_resize_cq - Modifies the capacity of the CQ.
 * @cq: The CQ to resize.
 * @cqe: The minimum size of the CQ.
 *
 * Users can examine the cq structure to determine the actual CQ size.
 */
int ib_resize_cq(struct ib_cq *cq, int cqe);

/**
 * ib_modify_cq - Modifies moderation params of the CQ
 * @cq: The CQ to modify.
 * @cq_count: number of CQEs that will trigger an event
 * @cq_period: max period of time in usec before triggering an event
 *
 */
int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);

/**
 * ib_destroy_cq - Destroys the specified CQ.
 * @cq: The CQ to destroy.
*/
int ib_destroy_cq(struct ib_cq *cq);

/**
 * ib_poll_cq - poll a CQ for completion(s)
 * @cq: the CQ being polled
 * @num_entries: maximum number of completions to return
 * @wc: array of at least @num_entries &struct ib_wc where completions
 *   will be returned
 *
 * Poll a CQ for (possibly multiple) completions.  If the return value
 * is < 0, an error occurred.  If the return value is >= 0, it is the
 * number of completions returned.  If the return value is
 * non-negative and < num_entries, then the CQ was emptied.
 */
static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
			     struct ib_wc *wc)
{
	/* Forwards to the device's poll_cq method. */
	return cq->device->poll_cq(cq, num_entries, wc);
}

/**
 * ib_peek_cq - Returns the number of unreaped completions currently
 *   on the specified CQ.
 * @cq: The CQ to peek.
 * @wc_cnt: A minimum number of unreaped completions to check for.
 *
 * If the number of unreaped completions is greater than or equal to wc_cnt,
 * this function returns wc_cnt, otherwise, it returns the actual number of
 * unreaped completions.
 */
int ib_peek_cq(struct ib_cq *cq, int wc_cnt);

/**
 * ib_req_notify_cq - Request completion notification on a CQ.
 * @cq: The CQ to generate an event for.
 * @flags:
 *   Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
 *   to request an event on the next solicited event or next work
 *   completion of any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
 *   may also be |ed in to request a hint about missed events, as
 *   described below.
 *
 * Return Value:
 *    < 0 means an error occurred while requesting notification
 *   == 0 means notification was requested successfully, and if
 *        IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
 *        were missed and it is safe to wait for another event.  In
 *        this case it is guaranteed that any work completions added
 *        to the CQ since the last CQ poll will trigger a completion
 *        notification event.
 *    > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
 *        in.
It means that the consumer must poll the CQ again to
 *        make sure it is empty to avoid missing an event because of a
 *        race between requesting notification and an entry being
 *        added to the CQ.  This return value means it is possible
 *        (but not guaranteed) that a work completion has been added
 *        to the CQ since the last poll without triggering a
 *        completion notification event.
 */
static inline int ib_req_notify_cq(struct ib_cq *cq,
				   enum ib_cq_notify_flags flags)
{
	/* Forwards to the device's req_notify_cq method. */
	return cq->device->req_notify_cq(cq, flags);
}

/**
 * ib_req_ncomp_notif - Request completion notification when there are
 *   at least the specified number of unreaped completions on the CQ.
 * @cq: The CQ to generate an event for.
 * @wc_cnt: The number of unreaped completions that should be on the
 *   CQ before an event is generated.
 *
 * Return: the result of the device's req_ncomp_notif method, or -ENOSYS
 * when the device does not provide one.
 */
static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
{
	return cq->device->req_ncomp_notif ?
		cq->device->req_ncomp_notif(cq, wc_cnt) :
		-ENOSYS;
}

/**
 * ib_dma_mapping_error - check a DMA addr for error
 * @dev: The device for which the dma_addr was created
 * @dma_addr: The DMA address to check
 *
 * Uses dev->dma_ops->mapping_error() when the device supplies dma_ops,
 * otherwise dma_mapping_error() on dev->dma_device.
 */
static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
{
	if (dev->dma_ops)
		return dev->dma_ops->mapping_error(dev, dma_addr);
	return dma_mapping_error(dev->dma_device, dma_addr);
}

/**
 * ib_dma_map_single - Map a kernel virtual address to DMA address
 * @dev: The device for which the dma_addr is to be created
 * @cpu_addr: The kernel virtual address
 * @size: The size of the region in bytes
 * @direction: The direction of the DMA
 *
 * Uses dev->dma_ops->map_single() when the device supplies dma_ops,
 * otherwise dma_map_single() on dev->dma_device.
 */
static inline u64 ib_dma_map_single(struct ib_device *dev,
				    void *cpu_addr, size_t size,
				    enum dma_data_direction direction)
{
	if (dev->dma_ops)
		return dev->dma_ops->map_single(dev, cpu_addr, size, direction);
	return dma_map_single(dev->dma_device, cpu_addr, size, direction);
}

/**
 * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single()
 * @dev: The device for which the DMA address was created
 * @addr: The DMA
address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_single(dev, addr, size, direction); else dma_unmap_single(dev->dma_device, addr, size, direction); } static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { return dma_map_single_attrs(dev->dma_device, cpu_addr, size, direction, dma_attrs); } static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { return dma_unmap_single_attrs(dev->dma_device, addr, size, direction, dma_attrs); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_page(dev, page, offset, size, direction); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_page(dev, addr, size, direction); else dma_unmap_page(dev->dma_device, addr, size, direction); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which 
the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_sg(dev, sg, nents, direction); return dma_map_sg(dev->dma_device, sg, nents, direction); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_sg(dev, sg, nents, direction); else dma_unmap_sg(dev->dma_device, sg, nents, direction); } static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { if (dev->dma_ops) return dev->dma_ops->map_sg_attrs(dev, sg, nents, direction, dma_attrs); else return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { if (dev->dma_ops) return dev->dma_ops->unmap_sg_attrs(dev, sg, nents, direction, dma_attrs); else dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } /** * ib_sg_dma_address - Return the DMA address from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry * * Note: this function is obsolete. To do: change all occurrences of * ib_sg_dma_address() into sg_dma_address(). 
*/
static inline u64 ib_sg_dma_address(struct ib_device *dev,
				    struct scatterlist *sg)
{
	/* @dev is unused; this is a direct wrapper around sg_dma_address(). */
	return sg_dma_address(sg);
}

/**
 * ib_sg_dma_len - Return the DMA length from a scatter/gather entry
 * @dev: The device for which the DMA addresses were created
 * @sg: The scatter/gather entry
 *
 * Note: this function is obsolete. To do: change all occurrences of
 * ib_sg_dma_len() into sg_dma_len().
 */
static inline unsigned int ib_sg_dma_len(struct ib_device *dev,
					 struct scatterlist *sg)
{
	/* @dev is unused; this is a direct wrapper around sg_dma_len(). */
	return sg_dma_len(sg);
}

/**
 * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
 * @dev: The device for which the DMA address was created
 * @addr: The DMA address
 * @size: The size of the region in bytes
 * @dir: The direction of the DMA
 */
static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev,
					      u64 addr,
					      size_t size,
					      enum dma_data_direction dir)
{
	if (dev->dma_ops)
		dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir);
	else
		dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
}

/**
 * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device
 * @dev: The device for which the DMA address was created
 * @addr: The DMA address
 * @size: The size of the region in bytes
 * @dir: The direction of the DMA
 */
static inline void ib_dma_sync_single_for_device(struct ib_device *dev,
						 u64 addr,
						 size_t size,
						 enum dma_data_direction dir)
{
	if (dev->dma_ops)
		dev->dma_ops->sync_single_for_device(dev, addr, size, dir);
	else
		dma_sync_single_for_device(dev->dma_device, addr, size, dir);
}

/**
 * ib_dma_alloc_coherent - Allocate memory and map it for DMA
 * @dev: The device for which the DMA address is requested
 * @size: The size of the region to allocate in bytes
 * @dma_handle: A pointer for returning the DMA address of the region
 * @flag: memory allocator flags
 */
static inline void *ib_dma_alloc_coherent(struct ib_device *dev,
					   size_t size,
					   u64 *dma_handle,
					   gfp_t flag)
{
	if (dev->dma_ops)
		return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag);
	else {
dma_addr_t handle;
		void *ret;

		/*
		 * dma_alloc_coherent() reports the address through a
		 * dma_addr_t, so go through a local and copy it into the
		 * caller's u64.
		 */
		ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag);
		*dma_handle = handle;
		return ret;
	}
}

/**
 * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent()
 * @dev: The device for which the DMA addresses were allocated
 * @size: The size of the region
 * @cpu_addr: the address returned by ib_dma_alloc_coherent()
 * @dma_handle: the DMA address returned by ib_dma_alloc_coherent()
 */
static inline void ib_dma_free_coherent(struct ib_device *dev,
					size_t size, void *cpu_addr,
					u64 dma_handle)
{
	if (dev->dma_ops)
		dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
	else
		dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
}

/**
 * ib_dereg_mr - Deregisters a memory region and removes it from the
 *   HCA translation table.
 * @mr: The memory region to deregister.
 *
 * This function can fail, if the memory region has memory windows bound to it.
 */
int ib_dereg_mr(struct ib_mr *mr);

struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
			  enum ib_mr_type mr_type,
			  u32 max_num_sg);

/**
 * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
 *   R_Key and L_Key.
 * @mr: struct ib_mr pointer to be updated.
 * @newkey: new key to be used.
 *
 * Replaces the low 8 bits of both mr->lkey and mr->rkey with @newkey and
 * leaves the upper 24 bits untouched.
 */
static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
{
	mr->lkey = (mr->lkey & 0xffffff00) | newkey;
	mr->rkey = (mr->rkey & 0xffffff00) | newkey;
}

/**
 * ib_inc_rkey - increments the key portion of the given rkey. Can be used
 * for calculating a new rkey for type 2 memory windows.
 * @rkey: the rkey to increment.
 *
 * Only the low 8 bits are incremented, wrapping from 0xff to 0x00; the
 * upper 24 bits are returned unchanged.
 */
static inline u32 ib_inc_rkey(u32 rkey)
{
	const u32 mask = 0x000000ff;
	return ((rkey + 1) & mask) | (rkey & ~mask);
}

/**
 * ib_alloc_fmr - Allocates a unmapped fast memory region.
 * @pd: The protection domain associated with the unmapped region.
 * @mr_access_flags: Specifies the memory access rights.
 * @fmr_attr: Attributes of the unmapped region.
 *
 * A fast memory region must be mapped before it can be used as part of
 * a work request.
*/
struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
			    int mr_access_flags,
			    struct ib_fmr_attr *fmr_attr);

/**
 * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
 * @fmr: The fast memory region to associate with the pages.
 * @page_list: An array of physical pages to map to the fast memory region.
 * @list_len: The number of pages in page_list.
 * @iova: The I/O virtual address to use with the mapped region.
 *
 * Forwards to the device's map_phys_fmr method and returns its result.
 */
static inline int ib_map_phys_fmr(struct ib_fmr *fmr,
				  u64 *page_list, int list_len,
				  u64 iova)
{
	return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova);
}

/**
 * ib_unmap_fmr - Removes the mapping from a list of fast memory regions.
 * @fmr_list: A linked list of fast memory regions to unmap.
 */
int ib_unmap_fmr(struct list_head *fmr_list);

/**
 * ib_dealloc_fmr - Deallocates a fast memory region.
 * @fmr: The fast memory region to deallocate.
 */
int ib_dealloc_fmr(struct ib_fmr *fmr);

/**
 * ib_attach_mcast - Attaches the specified QP to a multicast group.
 * @qp: QP to attach to the multicast group.  The QP must be type
 *   IB_QPT_UD.
 * @gid: Multicast group GID.
 * @lid: Multicast group LID in host byte order.
 *
 * In order to send and receive multicast packets, subnet
 * administration must have created the multicast group and configured
 * the fabric appropriately.  The port associated with the specified
 * QP must also be a member of the multicast group.
 */
int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

/**
 * ib_detach_mcast - Detaches the specified QP from a multicast group.
 * @qp: QP to detach from the multicast group.
 * @gid: Multicast group GID.
 * @lid: Multicast group LID in host byte order.
 */
int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

/**
 * ib_alloc_xrcd - Allocates an XRC domain.
 * @device: The device on which to allocate the XRC domain.
 */
struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);

/**
 * ib_dealloc_xrcd - Deallocates an XRC domain.
 * @xrcd: The XRC domain to deallocate.
*/ int ib_dealloc_xrcd(struct ib_xrcd *xrcd); struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int ib_destroy_flow(struct ib_flow *flow_id); static inline int ib_check_mr_access(int flags) { /* * Local write permission is required if remote write or * remote atomic permission is also requested. */ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && !(flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; return 0; } /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected * ib_mr. first use is for signature status check. * * @mr: A memory region. * @check_mask: Bitmask of which checks to perform from * ib_mr_status_check enumeration. * @mr_status: The container of relevant status checks. * failed checks will be indicated in the status bitmask * and the relevant info shall be in the error item. */ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); struct ifnet *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq(struct ib_wq *wq); int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask); struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr* wq_ind_table_init_attr); int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); static inline int ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { int n; n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); mr->iova = 0; return n; } int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int 
*sg_offset, int (*set_page)(struct ib_mr *, u64)); void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); int ib_resolve_eth_dmac(struct ib_device *device, struct ib_ah_attr *ah_attr); #endif /* IB_VERBS_H */