Index: projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping.c =================================================================== --- projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping.c (revision 319974) @@ -1,3347 +1,2144 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "krping.h" #include "getopt.h" +#define PFX "krping: " + extern int krping_debug; -#define DEBUG_LOG(cb, x...) 
if (krping_debug) log(LOG_INFO, x) -#define PRINTF(cb, x...) log(LOG_INFO, x) +#define DEBUG_LOG(...) do { if (krping_debug) log(LOG_INFO, __VA_ARGS__); } while (0) #define BIND_INFO 1 MODULE_AUTHOR("Steve Wise"); -MODULE_DESCRIPTION("RDMA ping client/server"); +MODULE_DESCRIPTION("RDMA ping server"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(krping, 1); MODULE_DEPEND(krping, linuxkpi, 1, 1, 1); static __inline uint64_t get_cycles(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((u_int64_t)high << 32)); } typedef uint64_t cycles_t; enum mem_type { DMA = 1, - FASTREG = 2, - MW = 3, - MR = 4 + REG = 2, }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, + {"addr6", OPT_STRING, 'A'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, - {"mem_mode", OPT_STRING, 'm'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, {"bw", OPT_NOPARAM, 'B'}, {"duplex", OPT_NOPARAM, 'd'}, {"txdepth", OPT_INT, 'T'}, {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, - {"fr", OPT_INT, 'f'}, + {"fr", OPT_NOPARAM, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) -static struct mutex krping_mutex; +static DEFINE_MUTEX(krping_mutex); /* * List of running krping threads. 
*/ static LIST_HEAD(krping_cbs); /* + * Invoke like this, one on each side, using the server's address on + * the RDMA device (iw%d): + * + * /bin/echo server,port=9999,addr=192.168.69.142,validate > /proc/krping + * /bin/echo client,port=9999,addr=192.168.69.142,validate > /proc/krping + * /bin/echo client,port=9999,addr6=2001:db8:0:f101::1,validate > /proc/krping + * * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/add/len * server rdma reads "ping" data from source * server sends "go ahead" on rdma read completion * client sends sink rkey/addr/len * server receives sink rkey/addr/len * server rdma writes "pong" data to sink * server sends "go ahead" on rdma write completion * */ /* * These states are used to signal events between the completion handler * and the main client or server thread. * * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, * and RDMA_WRITE_COMPLETE for each ping. */ enum test_state { IDLE = 1, CONNECT_REQUEST, ADDR_RESOLVED, ROUTE_RESOLVED, CONNECTED, RDMA_READ_ADV, RDMA_READ_COMPLETE, RDMA_WRITE_ADV, RDMA_WRITE_COMPLETE, ERROR }; struct krping_rdma_info { uint64_t buf; uint32_t rkey; uint32_t size; }; /* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 #define RPING_SQ_DEPTH 64 /* * Control block struct. 
*/ struct krping_cb { - void *cookie; int server; /* 0 iff client */ struct ib_cq *cq; struct ib_pd *pd; struct ib_qp *qp; - enum mem_type mem; struct ib_mr *dma_mr; struct ib_fast_reg_page_list *page_list; int page_list_len; - struct ib_send_wr fastreg_wr; + struct ib_reg_wr reg_mr_wr; struct ib_send_wr invalidate_wr; - struct ib_mr *fastreg_mr; + struct ib_mr *reg_mr; int server_invalidate; int read_inv; u8 key; - struct ib_mw *mw; - struct ib_mw_bind bind_attr; - struct ib_recv_wr rq_wr; /* recv work request record */ struct ib_sge recv_sgl; /* recv single SGE */ - struct krping_rdma_info recv_buf;/* malloc'd buffer */ + struct krping_rdma_info recv_buf __aligned(16); /* malloc'd buffer */ u64 recv_dma_addr; DECLARE_PCI_UNMAP_ADDR(recv_mapping) - struct ib_mr *recv_mr; struct ib_send_wr sq_wr; /* send work requrest record */ struct ib_sge send_sgl; - struct krping_rdma_info send_buf;/* single send buf */ + struct krping_rdma_info send_buf __aligned(16); /* single send buf */ u64 send_dma_addr; DECLARE_PCI_UNMAP_ADDR(send_mapping) - struct ib_mr *send_mr; - struct ib_send_wr rdma_sq_wr; /* rdma work request record */ + struct ib_rdma_wr rdma_sq_wr; /* rdma work request record */ struct ib_sge rdma_sgl; /* rdma single SGE */ char *rdma_buf; /* used as rdma sink */ u64 rdma_dma_addr; DECLARE_PCI_UNMAP_ADDR(rdma_mapping) struct ib_mr *rdma_mr; uint32_t remote_rkey; /* remote guys RKEY */ uint64_t remote_addr; /* remote guys TO */ uint32_t remote_len; /* remote guys LEN */ char *start_buf; /* rdma read src */ u64 start_dma_addr; DECLARE_PCI_UNMAP_ADDR(start_mapping) struct ib_mr *start_mr; enum test_state state; /* used for cond/signalling */ wait_queue_head_t sem; struct krping_stats stats; uint16_t port; /* dst port in NBO */ - struct in_addr addr; /* dst addr in NBO */ + u8 addr[16]; /* dst addr in NBO */ char *addr_str; /* dst addr string */ + uint8_t addr_type; /* ADDR_FAMILY - IPv4/V6 */ int verbose; /* verbose logging */ int count; /* ping count */ int size; 
/* ping data size */ int validate; /* validate ping data */ int wlat; /* run wlat test */ int rlat; /* run rlat test */ int bw; /* run bw test */ int duplex; /* run bw full duplex test */ int poll; /* poll or block for rlat test */ int txdepth; /* SQ depth */ int local_dma_lkey; /* use 0 for lkey */ - int frtest; /* fastreg test */ - int testnum; + int frtest; /* reg test */ /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ /* listener on server side. */ struct rdma_cm_id *child_cm_id; /* connection on server side */ struct list_head list; }; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret; struct krping_cb *cb = cma_id->context; - DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, - cma_id, (cma_id == cb->cm_id) ? "parent" : "child"); + DEBUG_LOG("cma_event type %d cma_id %p (%s)\n", event->event, cma_id, + (cma_id == cb->cm_id) ? "parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { - PRINTF(cb, "rdma_resolve_route error %d\n", ret); + printk(KERN_ERR PFX "rdma_resolve_route error %d\n", + ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; - cb->child_cm_id = cma_id; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: - if (cb->state == IDLE) { - cb->state = CONNECT_REQUEST; - cb->child_cm_id = cma_id; - } else { - PRINTF(cb, "Received connection request in wrong state" - " (%d)\n", cb->state); - } - DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); + cb->state = CONNECT_REQUEST; + cb->child_cm_id = cma_id; + DEBUG_LOG("child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: - DEBUG_LOG(cb, "ESTABLISHED\n"); + DEBUG_LOG("ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } wake_up_interruptible(&cb->sem); break; case 
RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - PRINTF(cb, "cma event %d, error %d\n", event->event, + printk(KERN_ERR PFX "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: - PRINTF(cb, "DISCONNECT EVENT...\n"); + printk(KERN_ERR PFX "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - PRINTF(cb, "cma detected device removal!!!!\n"); + printk(KERN_ERR PFX "cma detected device removal!!!!\n"); break; default: - PRINTF(cb, "oof bad type!\n"); + printk(KERN_ERR PFX "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { - PRINTF(cb, "Received bogus data, size %d\n", + printk(KERN_ERR PFX "Received bogus data, size %d\n", wc->byte_len); return -1; } cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); - DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", + DEBUG_LOG("Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) cb->state = RDMA_READ_ADV; else cb->state = RDMA_WRITE_ADV; return 0; } static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { - PRINTF(cb, "Received bogus data, size %d\n", + printk(KERN_ERR PFX "Received bogus data, size %d\n", wc->byte_len); return -1; } if (cb->state == RDMA_READ_ADV) cb->state = RDMA_WRITE_ADV; else cb->state = RDMA_WRITE_COMPLETE; return 0; } static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) { struct krping_cb *cb = ctx; struct ib_wc wc; struct 
ib_recv_wr *bad_wr; int ret; BUG_ON(cb->cq != cq); if (cb->state == ERROR) { - PRINTF(cb, "cq completion in ERROR state\n"); + printk(KERN_ERR PFX "cq completion in ERROR state\n"); return; } - if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) + if (cb->frtest) { + printk(KERN_ERR PFX "cq completion event in frtest!\n"); + return; + } + if (!cb->wlat && !cb->rlat && !cb->bw) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { - DEBUG_LOG(cb, "cq flushed\n"); + DEBUG_LOG("cq flushed\n"); continue; } else { - PRINTF(cb, "cq completion failed with " + printk(KERN_ERR PFX "cq completion failed with " "wr_id %jx status %d opcode %d vender_err %x\n", (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } switch (wc.opcode) { case IB_WC_SEND: - DEBUG_LOG(cb, "send completion\n"); + DEBUG_LOG("send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: - DEBUG_LOG(cb, "rdma write completion\n"); - cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; + DEBUG_LOG("rdma write completion\n"); + cb->stats.write_bytes += cb->rdma_sq_wr.wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: - DEBUG_LOG(cb, "rdma read completion\n"); - cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; + DEBUG_LOG("rdma read completion\n"); + cb->stats.read_bytes += cb->rdma_sq_wr.wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RECV: - DEBUG_LOG(cb, "recv completion\n"); + DEBUG_LOG("recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; - if (cb->wlat || cb->rlat || cb->bw || cb->frtest) + if (cb->wlat || cb->rlat || cb->bw) ret = server_recv(cb, &wc); else ret = cb->server ? 
server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { - PRINTF(cb, "recv wc error: %d\n", ret); + printk(KERN_ERR PFX "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post recv error: %d\n", + printk(KERN_ERR PFX "post recv error: %d\n", ret); goto error; } wake_up_interruptible(&cb->sem); break; default: - PRINTF(cb, + printk(KERN_ERR PFX "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); goto error; } return; error: cb->state = ERROR; wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; - DEBUG_LOG(cb, "accepting client connection request\n"); + DEBUG_LOG("accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { - PRINTF(cb, "rdma_accept error: %d\n", ret); + printk(KERN_ERR PFX "rdma_accept error: %d\n", ret); return ret; } - if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { + if (!cb->wlat && !cb->rlat && !cb->bw) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { - PRINTF(cb, "wait for CONNECTED state %d\n", + printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state); return -1; } } return 0; } static void krping_setup_wr(struct krping_cb *cb) { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; - if (cb->local_dma_lkey) - cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey; - else if (cb->mem == DMA) - cb->recv_sgl.lkey = cb->dma_mr->lkey; - else - cb->recv_sgl.lkey = cb->recv_mr->lkey; + cb->recv_sgl.lkey = cb->pd->local_dma_lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = 
sizeof cb->send_buf; - if (cb->local_dma_lkey) - cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; - else if (cb->mem == DMA) - cb->send_sgl.lkey = cb->dma_mr->lkey; - else - cb->send_sgl.lkey = cb->send_mr->lkey; + cb->send_sgl.lkey = cb->pd->local_dma_lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; - if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { + if (cb->server || cb->wlat || cb->rlat || cb->bw) { cb->rdma_sgl.addr = cb->rdma_dma_addr; - if (cb->mem == MR) - cb->rdma_sgl.lkey = cb->rdma_mr->lkey; - cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; - cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; - cb->rdma_sq_wr.num_sge = 1; + cb->rdma_sq_wr.wr.send_flags = IB_SEND_SIGNALED; + cb->rdma_sq_wr.wr.sg_list = &cb->rdma_sgl; + cb->rdma_sq_wr.wr.num_sge = 1; } - switch(cb->mem) { - case FASTREG: + /* + * A chain of 2 WRs, INVALDATE_MR + REG_MR. + * both unsignaled. The client uses them to reregister + * the rdma buffers with a new key each iteration. + */ + cb->reg_mr_wr.wr.opcode = IB_WR_REG_MR; + cb->reg_mr_wr.mr = cb->reg_mr; - /* - * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR. - * both unsignaled. The client uses them to reregister - * the rdma buffers with a new key each iteration. 
- */ - cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; - cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - cb->fastreg_wr.wr.fast_reg.length = cb->size; - cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; - cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; - - cb->invalidate_wr.next = &cb->fastreg_wr; - cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; - break; - case MW: - cb->bind_attr.wr_id = 0xabbaabba; - cb->bind_attr.send_flags = 0; /* unsignaled */ -#ifdef BIND_INFO - cb->bind_attr.bind_info.length = cb->size; -#else - cb->bind_attr.length = cb->size; -#endif - break; - default: - break; - } + cb->invalidate_wr.next = &cb->reg_mr_wr.wr; + cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; } static int krping_setup_buffers(struct krping_cb *cb) { int ret; - struct ib_phys_buf buf; - u64 iovbase; - DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); + DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); - cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, + cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); - cb->send_dma_addr = ib_dma_map_single(cb->pd->device, + cb->send_dma_addr = ib_dma_map_single(cb->pd->device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); - if (cb->mem == DMA) { - cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| - IB_ACCESS_REMOTE_READ| - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(cb->dma_mr)) { - DEBUG_LOG(cb, "reg_dmamr failed\n"); - ret = PTR_ERR(cb->dma_mr); - goto bail; - } - } else { - if (!cb->local_dma_lkey) { - buf.addr = cb->recv_dma_addr; - buf.size = sizeof cb->recv_buf; - DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n", - (uintmax_t)buf.addr, (int)buf.size); - iovbase = cb->recv_dma_addr; - cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - IB_ACCESS_LOCAL_WRITE, - &iovbase); - - if (IS_ERR(cb->recv_mr)) { - 
DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); - ret = PTR_ERR(cb->recv_mr); - goto bail; - } - - buf.addr = cb->send_dma_addr; - buf.size = sizeof cb->send_buf; - DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n", - (uintmax_t)buf.addr, (int)buf.size); - iovbase = cb->send_dma_addr; - cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - 0, &iovbase); - - if (IS_ERR(cb->send_mr)) { - DEBUG_LOG(cb, "send_buf reg_mr failed\n"); - ret = PTR_ERR(cb->send_mr); - goto bail; - } - } - } - - cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); + cb->rdma_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size, + &cb->rdma_dma_addr, + GFP_KERNEL); if (!cb->rdma_buf) { - DEBUG_LOG(cb, "rdma_buf malloc failed\n"); + DEBUG_LOG(PFX "rdma_buf allocation failed\n"); ret = -ENOMEM; goto bail; } - - cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device, - cb->rdma_buf, cb->size, - DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); - if (cb->mem != DMA) { - switch (cb->mem) { - case FASTREG: - cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + - PAGE_SIZE) >> PAGE_SHIFT; - cb->page_list = ib_alloc_fast_reg_page_list( - cb->pd->device, - cb->page_list_len); - if (IS_ERR(cb->page_list)) { - DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); - ret = PTR_ERR(cb->page_list); - goto bail; - } - cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, - cb->page_list->max_page_list_len); - if (IS_ERR(cb->fastreg_mr)) { - DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); - ret = PTR_ERR(cb->fastreg_mr); - goto bail; - } - DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" - " page_list_len %u\n", cb->fastreg_mr->rkey, - cb->page_list, cb->page_list_len); - break; - case MW: - cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1); - if (IS_ERR(cb->mw)) { - DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); - ret = PTR_ERR(cb->mw); - goto bail; - } - DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); - /*FALLTHROUGH*/ - case MR: - buf.addr = cb->rdma_dma_addr; - buf.size = cb->size; - iovbase = cb->rdma_dma_addr; - cb->rdma_mr = 
ib_reg_phys_mr(cb->pd, &buf, 1, - IB_ACCESS_LOCAL_WRITE| - IB_ACCESS_REMOTE_READ| - IB_ACCESS_REMOTE_WRITE, - &iovbase); - if (IS_ERR(cb->rdma_mr)) { - DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); - ret = PTR_ERR(cb->rdma_mr); - goto bail; - } - DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n", - (uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey); - break; - default: - ret = -EINVAL; - goto bail; - break; - } + cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) + >> PAGE_SHIFT; + cb->reg_mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, + cb->page_list_len); + if (IS_ERR(cb->reg_mr)) { + ret = PTR_ERR(cb->reg_mr); + DEBUG_LOG(PFX "recv_buf reg_mr failed %d\n", ret); + goto bail; } + DEBUG_LOG(PFX "reg rkey 0x%x page_list_len %u\n", + cb->reg_mr->rkey, cb->page_list_len); - if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { + if (!cb->server || cb->wlat || cb->rlat || cb->bw) { - cb->start_buf = kmalloc(cb->size, GFP_KERNEL); + cb->start_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size, + &cb->start_dma_addr, + GFP_KERNEL); if (!cb->start_buf) { - DEBUG_LOG(cb, "start_buf malloc failed\n"); + DEBUG_LOG(PFX "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } - - cb->start_dma_addr = ib_dma_map_single(cb->pd->device, - cb->start_buf, cb->size, - DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); - - if (cb->mem == MR || cb->mem == MW) { - unsigned flags = IB_ACCESS_REMOTE_READ; - - if (cb->wlat || cb->rlat || cb->bw || cb->frtest) { - flags |= IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE; - } - - buf.addr = cb->start_dma_addr; - buf.size = cb->size; - DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n", - (uintmax_t)buf.addr, (int)buf.size); - iovbase = cb->start_dma_addr; - cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - flags, - &iovbase); - - if (IS_ERR(cb->start_mr)) { - DEBUG_LOG(cb, "start_buf reg_mr failed\n"); - ret = PTR_ERR(cb->start_mr); - goto bail; - } - } } krping_setup_wr(cb); 
- DEBUG_LOG(cb, "allocated & registered buffers...\n"); + DEBUG_LOG(PFX "allocated & registered buffers...\n"); return 0; bail: - if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) - ib_dereg_mr(cb->fastreg_mr); - if (cb->mw && !IS_ERR(cb->mw)) - ib_dealloc_mw(cb->mw); + if (cb->reg_mr && !IS_ERR(cb->reg_mr)) + ib_dereg_mr(cb->reg_mr); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); - if (cb->page_list && !IS_ERR(cb->page_list)) - ib_free_fast_reg_page_list(cb->page_list); if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); - if (cb->recv_mr && !IS_ERR(cb->recv_mr)) - ib_dereg_mr(cb->recv_mr); - if (cb->send_mr && !IS_ERR(cb->send_mr)) - ib_dereg_mr(cb->send_mr); - if (cb->rdma_buf) - kfree(cb->rdma_buf); - if (cb->start_buf) - kfree(cb->start_buf); + if (cb->rdma_buf) { + ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf, + cb->rdma_dma_addr); + } + if (cb->start_buf) { + ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf, + cb->start_dma_addr); + } return ret; } static void krping_free_buffers(struct krping_cb *cb) { - DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); + DEBUG_LOG("krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); - if (cb->send_mr) - ib_dereg_mr(cb->send_mr); - if (cb->recv_mr) - ib_dereg_mr(cb->recv_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); - if (cb->fastreg_mr) - ib_dereg_mr(cb->fastreg_mr); - if (cb->mw) - ib_dealloc_mw(cb->mw); + if (cb->reg_mr) + ib_dereg_mr(cb->reg_mr); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); - dma_unmap_single(cb->pd->device->dma_device, - pci_unmap_addr(cb, rdma_mapping), - cb->size, DMA_BIDIRECTIONAL); - kfree(cb->rdma_buf); + + ib_dma_free_coherent(cb->pd->device, cb->size, 
cb->rdma_buf, + cb->rdma_dma_addr); + if (cb->start_buf) { - dma_unmap_single(cb->pd->device->dma_device, - pci_unmap_addr(cb, start_mapping), - cb->size, DMA_BIDIRECTIONAL); - kfree(cb->start_buf); + ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf, + cb->start_dma_addr); } } static int krping_create_qp(struct krping_cb *cb) { struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; + + /* For flush_qp() */ + init_attr.cap.max_send_wr++; + init_attr.cap.max_recv_wr++; + init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->child_cm_id->qp; } else { ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->cm_id->qp; } return ret; } static void krping_free_qp(struct krping_cb *cb) { ib_destroy_qp(cb->qp); ib_destroy_cq(cb->cq); ib_dealloc_pd(cb->pd); } static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) { int ret; - cb->pd = ib_alloc_pd(cm_id->device); + struct ib_cq_init_attr attr = {0}; + + cb->pd = ib_alloc_pd(cm_id->device, 0); if (IS_ERR(cb->pd)) { - PRINTF(cb, "ib_alloc_pd failed\n"); + printk(KERN_ERR PFX "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } - DEBUG_LOG(cb, "created pd %p\n", cb->pd); + DEBUG_LOG("created pd %p\n", cb->pd); strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); + attr.cqe = cb->txdepth * 2; + attr.comp_vector = 0; cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, - cb, cb->txdepth * 2, 0); + cb, &attr); if (IS_ERR(cb->cq)) { - PRINTF(cb, "ib_create_cq failed\n"); + printk(KERN_ERR PFX "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } - DEBUG_LOG(cb, "created cq %p\n", cb->cq); + DEBUG_LOG("created 
cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { - PRINTF(cb, "ib_create_cq failed\n"); + printk(KERN_ERR PFX "ib_create_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { - PRINTF(cb, "krping_create_qp failed: %d\n", ret); + printk(KERN_ERR PFX "krping_create_qp failed: %d\n", ret); goto err2; } - DEBUG_LOG(cb, "created qp %p\n", cb->qp); + DEBUG_LOG("created qp %p\n", cb->qp); return 0; err2: ib_destroy_cq(cb->cq); err1: ib_dealloc_pd(cb->pd); return ret; } /* * return the (possibly rebound) rkey for the rdma buffer. - * FASTREG mode: invalidate and rebind via fastreg wr. - * MW mode: rebind the MW. + * REG mode: invalidate and rebind via reg wr. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { - u32 rkey = 0xffffffff; - u64 p; + u32 rkey; struct ib_send_wr *bad_wr; - int i; int ret; + struct scatterlist sg = {0}; - switch (cb->mem) { - case FASTREG: - cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey; + cb->invalidate_wr.ex.invalidate_rkey = cb->reg_mr->rkey; - /* - * Update the fastreg key. - */ - ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key); - cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey; + /* + * Update the reg key. + */ + ib_update_fast_reg_key(cb->reg_mr, ++cb->key); + cb->reg_mr_wr.key = cb->reg_mr->rkey; - /* - * Update the fastreg WR with new buf info. - */ - if (buf == (u64)cb->start_dma_addr) - cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ; - else - cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - cb->fastreg_wr.wr.fast_reg.iova_start = buf; - p = (u64)(buf & PAGE_MASK); - for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; - i++, p += PAGE_SIZE) { - cb->page_list->page_list[i] = p; - DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p); - } + /* + * Update the reg WR with new buf info. 
+ */ + if (buf == (u64)cb->start_dma_addr) + cb->reg_mr_wr.access = IB_ACCESS_REMOTE_READ; + else + cb->reg_mr_wr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; + sg_dma_address(&sg) = buf; + sg_dma_len(&sg) = cb->size; - DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u" - " iova_start %jx page_list_len %u\n", - post_inv, - cb->fastreg_wr.wr.fast_reg.rkey, - cb->fastreg_wr.wr.fast_reg.page_shift, - (unsigned)cb->fastreg_wr.wr.fast_reg.length, - (uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start, - cb->fastreg_wr.wr.fast_reg.page_list_len); + ret = ib_map_mr_sg(cb->reg_mr, &sg, 1, NULL, PAGE_SIZE); + BUG_ON(ret <= 0 || ret > cb->page_list_len); - if (post_inv) - ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); - else - ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - cb->state = ERROR; - } - rkey = cb->fastreg_mr->rkey; - break; - case MW: - /* - * Update the MW with new buf info. - */ - if (buf == (u64)cb->start_dma_addr) { -#ifdef BIND_INFO - cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ; - cb->bind_attr.bind_info.mr = cb->start_mr; -#else - cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; - cb->bind_attr.mr = cb->start_mr; -#endif - } else { -#ifdef BIND_INFO - cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE; - cb->bind_attr.bind_info.mr = cb->rdma_mr; -#else - cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; - cb->bind_attr.mr = cb->rdma_mr; -#endif - } -#ifdef BIND_INFO - cb->bind_attr.bind_info.addr = buf; -#else - cb->bind_attr.addr = buf; -#endif - DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n", -#ifdef BIND_INFO - cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey); -#else - cb->mw->rkey, buf, cb->bind_attr.mr->rkey); -#endif - ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); - if (ret) { - PRINTF(cb, "bind mw error %d\n", ret); - cb->state = ERROR; - } else - rkey = cb->mw->rkey; - 
break; - case MR: - if (buf == (u64)cb->start_dma_addr) - rkey = cb->start_mr->rkey; - else - rkey = cb->rdma_mr->rkey; - break; - case DMA: - rkey = cb->dma_mr->rkey; - break; - default: - PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__); + DEBUG_LOG(PFX "post_inv = %d, reg_mr new rkey 0x%x pgsz %u len %u" + " iova_start %llx\n", + post_inv, + cb->reg_mr_wr.key, + cb->reg_mr->page_size, + cb->reg_mr->length, + (unsigned long long)cb->reg_mr->iova); + + if (post_inv) + ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); + else + ret = ib_post_send(cb->qp, &cb->reg_mr_wr.wr, &bad_wr); + if (ret) { + printk(KERN_ERR PFX "post send error %d\n", ret); cb->state = ERROR; - break; } + rkey = cb->reg_mr->rkey; return rkey; } static void krping_format_send(struct krping_cb *cb, u64 buf) { struct krping_rdma_info *info = &cb->send_buf; u32 rkey; /* - * Client side will do fastreg or mw bind before + * Client side will do reg or mw bind before * advertising the rdma buffer. Server side * sends have no data. 
*/ - if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { + if (!cb->server || cb->wlat || cb->rlat || cb->bw) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); - DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n", + DEBUG_LOG("RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } static void krping_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr, inv; int ret; while (1) { /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { - PRINTF(cb, "wait for RDMA_READ_ADV state %d\n", + printk(KERN_ERR PFX "wait for RDMA_READ_ADV state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server received sink adv\n"); + DEBUG_LOG("server received sink adv\n"); - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->remote_len; - cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1); + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->remote_len; + cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, !cb->read_inv); + cb->rdma_sq_wr.wr.next = NULL; /* Issue RDMA Read. */ if (cb->read_inv) - cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { - cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; - if (cb->mem == FASTREG) { - /* - * Immediately follow the read with a - * fenced LOCAL_INV. - */ - cb->rdma_sq_wr.next = &inv; - memset(&inv, 0, sizeof inv); - inv.opcode = IB_WR_LOCAL_INV; - inv.ex.invalidate_rkey = cb->fastreg_mr->rkey; - inv.send_flags = IB_SEND_FENCE; - } + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ; + /* + * Immediately follow the read with a + * fenced LOCAL_INV. 
+ */ + cb->rdma_sq_wr.wr.next = &inv; + memset(&inv, 0, sizeof inv); + inv.opcode = IB_WR_LOCAL_INV; + inv.ex.invalidate_rkey = cb->reg_mr->rkey; + inv.send_flags = IB_SEND_FENCE; } - ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } - cb->rdma_sq_wr.next = NULL; + cb->rdma_sq_wr.wr.next = NULL; - DEBUG_LOG(cb, "server posted rdma read req \n"); + DEBUG_LOG("server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server received read complete\n"); + DEBUG_LOG("server received read complete\n"); /* Display data in recv buf */ - if (cb->verbose) { - if (strlen(cb->rdma_buf) > 128) { - char msgbuf[128]; + if (cb->verbose) + printk(KERN_INFO PFX "server ping data: %s\n", + cb->rdma_buf); - strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); - PRINTF(cb, "server ping data stripped: %s\n", - msgbuf); - } else - PRINTF(cb, "server ping data: %s\n", - cb->rdma_buf); - } - /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; - DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); + DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } - DEBUG_LOG(cb, "server posted go ahead\n"); + DEBUG_LOG("server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for 
RDMA_WRITE_ADV state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server received sink adv\n"); + DEBUG_LOG("server received sink adv\n"); /* RDMA Write echo data */ - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) - cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey; + cb->rdma_sgl.lkey = cb->pd->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); - DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n", - cb->rdma_sq_wr.sg_list->lkey, - (unsigned long long)cb->rdma_sq_wr.sg_list->addr, - cb->rdma_sq_wr.sg_list->length); + DEBUG_LOG("rdma write from lkey %x laddr %llx len %d\n", + cb->rdma_sq_wr.wr.sg_list->lkey, + (unsigned long long)cb->rdma_sq_wr.wr.sg_list->addr, + cb->rdma_sq_wr.wr.sg_list->length); - ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } /* Wait for completion */ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server rdma write complete \n"); + DEBUG_LOG("server rdma write complete \n"); cb->state = CONNECTED; /* Tell client to begin again */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; - DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); + DEBUG_LOG("send-w-inv rkey 0x%x\n", 
cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } - DEBUG_LOG(cb, "server posted go ahead\n"); + DEBUG_LOG("server posted go ahead\n"); } } static void rlat_test(struct krping_cb *cb) { int scnt; int iters = cb->count; struct timeval start_tv, stop_tv; int ret; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; scnt = 0; - cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->size; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { cb->state = RDMA_READ_ADV; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } while (scnt < iters) { cb->state = RDMA_READ_ADV; - ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { - PRINTF(cb, + printk(KERN_ERR PFX "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; } do { if (!cb->poll) { wait_event_interruptible(cb->sem, cb->state != RDMA_READ_ADV); if (cb->state == RDMA_READ_COMPLETE) { ne = 1; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } else { ne = -1; } } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { - PRINTF(cb, + printk(KERN_ERR PFX "state == ERROR...bailing scnt %d\n", scnt); return; } } while (ne == 0); if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { - PRINTF(cb, "Completion wth error at %s:\n", + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); - PRINTF(cb, "Failed status %d: wr_id %d\n", + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } ++scnt; } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } - PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n", + printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size); } static void wlat_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; volatile char *poll_buf = (char *) cb->start_buf; char *buf = (char *)cb->rdma_buf; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { - PRINTF(cb, "%s 
kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->size; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < iters && !(scnt < 1 && !cb->server)) { ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { - PRINTF(cb, + printk(KERN_ERR PFX "state = ERROR, bailing\n"); return; } } } if (scnt < iters) { struct ib_send_wr *bad_wr; *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); - if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { - PRINTF(cb, + if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { + printk(KERN_ERR PFX "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); scnt++; } if (ccnt < iters) { struct ib_wc wc; int ne; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ++ccnt; if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { - PRINTF(cb, + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); - PRINTF(cb, + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); - PRINTF(cb, + printk(KERN_ERR PFX "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } - PRINTF(cb, + printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void bw_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", 
__FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->size; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < cb->txdepth) { struct ib_send_wr *bad_wr; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); - if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { - PRINTF(cb, + if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { + printk(KERN_ERR PFX "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); ++scnt; } if (ccnt < iters) { int ne; struct ib_wc wc; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ccnt += 1; if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { - PRINTF(cb, + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); - PRINTF(cb, + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } - PRINTF(cb, + printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void krping_rlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); + printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } + wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_wlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's 
Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); + printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } wlat_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_bw_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); + printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } if (cb->duplex) bw_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } -static int fastreg_supported(struct krping_cb *cb, int server) +static int reg_supported(struct ib_device *dev) { - struct ib_device *dev = server?cb->child_cm_id->device: - cb->cm_id->device; - struct ib_device_attr attr; - int ret; + u64 needed_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; - ret = ib_query_device(dev, &attr); - if (ret) 
{ - PRINTF(cb, "ib_query_device failed ret %d\n", ret); + if ((dev->attrs.device_cap_flags & needed_flags) != needed_flags) { + printk(KERN_ERR PFX + "Fastreg not supported - device_cap_flags 0x%llx\n", + (unsigned long long)dev->attrs.device_cap_flags); return 0; } - if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { - PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n", - (unsigned long long)attr.device_cap_flags); - return 0; - } - DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n", - (uintmax_t)attr.device_cap_flags); + DEBUG_LOG("Fastreg supported - device_cap_flags 0x%llx\n", + (unsigned long long)dev->attrs.device_cap_flags); return 1; } +static void fill_sockaddr(struct sockaddr_storage *sin, struct krping_cb *cb) +{ + memset(sin, 0, sizeof(*sin)); + + if (cb->addr_type == AF_INET) { + struct sockaddr_in *sin4 = (struct sockaddr_in *)sin; + sin4->sin_family = AF_INET; + memcpy((void *)&sin4->sin_addr.s_addr, cb->addr, 4); + sin4->sin_port = cb->port; + } else if (cb->addr_type == AF_INET6) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; + sin6->sin6_family = AF_INET6; + memcpy((void *)&sin6->sin6_addr, cb->addr, 16); + sin6->sin6_port = cb->port; + } +} + static int krping_bind_server(struct krping_cb *cb) { - struct sockaddr_in sin; + struct sockaddr_storage sin; int ret; - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof sin; - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = cb->addr.s_addr; - sin.sin_port = cb->port; - ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); + fill_sockaddr(&sin, cb); + + ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *)&sin); if (ret) { - PRINTF(cb, "rdma_bind_addr error %d\n", ret); + printk(KERN_ERR PFX "rdma_bind_addr error %d\n", ret); return ret; } - DEBUG_LOG(cb, "rdma_bind_addr successful\n"); + DEBUG_LOG("rdma_bind_addr successful\n"); - DEBUG_LOG(cb, "rdma_listen\n"); + DEBUG_LOG("rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { - PRINTF(cb, 
"rdma_listen failed: %d\n", ret); + printk(KERN_ERR PFX "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { - PRINTF(cb, "wait for CONNECT_REQUEST state %d\n", + printk(KERN_ERR PFX "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } - if (cb->mem == FASTREG && !fastreg_supported(cb, 1)) + if (!reg_supported(cb->child_cm_id->device)) return -EINVAL; return 0; } -/* - * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads - * complete. - * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. - */ -static void krping_fr_test5(struct krping_cb *cb) -{ - struct ib_fast_reg_page_list **pl; - struct ib_send_wr *fr, *read, *bad; - struct ib_wc wc; - struct ib_sge *sgl; - u8 key = 0; - struct ib_mr **mr; - u8 **buf; - dma_addr_t *dma_addr; - int i; - int ret; - int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - time_t start; - int count = 0; - int scnt; - int depth = cb->txdepth >> 1; - - if (!depth) { - PRINTF(cb, "txdepth must be > 1 for this test!\n"); - return; - } - - pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); - mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); - fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); - sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); - read = kzalloc(sizeof *read * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth); - buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); - dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s dma_addr %p size 
%zu\n", __func__, dma_addr, sizeof *dma_addr * depth); - if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) { - PRINTF(cb, "kzalloc failed\n"); - goto err1; - } - - for (scnt = 0; scnt < depth; scnt++) { - pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl[scnt])) { - PRINTF(cb, "alloc_fr_page_list failed %ld\n", - PTR_ERR(pl[scnt])); - goto err2; - } - DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); - - mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr[scnt])) { - PRINTF(cb, "alloc_fr failed %ld\n", - PTR_ERR(mr[scnt])); - goto err2; - } - DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); - ib_update_fast_reg_key(mr[scnt], ++key); - - buf[scnt] = kmalloc(cb->size, GFP_KERNEL); - if (!buf[scnt]) { - PRINTF(cb, "kmalloc failed\n"); - ret = -ENOMEM; - goto err2; - } - DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); - dma_addr[scnt] = ib_dma_map_single(cb->pd->device, - buf[scnt], cb->size, - DMA_BIDIRECTIONAL); - if (dma_mapping_error(cb->pd->device->dma_device, - dma_addr[scnt])) { - PRINTF(cb, "dma_map failed\n"); - ret = -ENOMEM; - goto err2; - } - DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); - for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); - DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", - __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); - } - - sgl[scnt].lkey = mr[scnt]->rkey; - sgl[scnt].length = cb->size; - sgl[scnt].addr = (u64)buf[scnt]; - DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n", - __func__, scnt, sgl[scnt].lkey, sgl[scnt].length, - (uintmax_t)sgl[scnt].addr); - - fr[scnt].opcode = IB_WR_FAST_REG_MR; - fr[scnt].wr_id = scnt; - fr[scnt].send_flags = 0; - fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; - fr[scnt].wr.fast_reg.length = cb->size; - fr[scnt].wr.fast_reg.page_list = pl[scnt]; - fr[scnt].wr.fast_reg.page_list_len = plen; - fr[scnt].wr.fast_reg.iova_start = 
(u64)buf[scnt]; - fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; - fr[scnt].next = &read[scnt]; - read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV; - read[scnt].wr_id = scnt; - read[scnt].send_flags = IB_SEND_SIGNALED; - read[scnt].wr.rdma.rkey = cb->remote_rkey; - read[scnt].wr.rdma.remote_addr = cb->remote_addr; - read[scnt].num_sge = 1; - read[scnt].sg_list = &sgl[scnt]; - ret = ib_post_send(cb->qp, &fr[scnt], &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err2; - } - } - - start = time_uptime; - DEBUG_LOG(cb, "%s starting IO.\n", __func__); - while (!cb->count || cb->server || count < cb->count) { - if ((time_uptime - start) >= 9) { - DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, - count); - wait_event_interruptible_timeout(cb->sem, - cb->state == ERROR, - 1); - if (cb->state == ERROR) - break; - start = time_uptime; - } - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", - ret); - goto err2; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, - "completion error %u wr_id %ju " - "opcode %d\n", wc.status, - (uintmax_t)wc.wr_id, wc.opcode); - goto err2; - } - count++; - if (count == cb->count) - break; - ib_update_fast_reg_key(mr[wc.wr_id], ++key); - fr[wc.wr_id].wr.fast_reg.rkey = - mr[wc.wr_id]->rkey; - sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey; - ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad); - if (ret) { - PRINTF(cb, - "ib_post_send failed %d\n", ret); - goto err2; - } - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err2; - } - } while (ret == 1); - } - DEBUG_LOG(cb, "%s done!\n", __func__); -err2: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 
1) { - if (wc.status) { - PRINTF(cb, "completion error %u " - "opcode %u\n", wc.status, wc.opcode); - } - } - } while (ret == 1); - - DEBUG_LOG(cb, "destroying fr mrs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (mr[scnt]) { - ib_dereg_mr(mr[scnt]); - DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); - } - } - DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (buf[scnt]) { - dma_unmap_single(cb->pd->device->dma_device, - dma_addr[scnt], cb->size, - DMA_BIDIRECTIONAL); - kfree(buf[scnt]); - DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); - } - } - DEBUG_LOG(cb, "destroying fr page lists!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (pl[scnt]) { - DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); - ib_free_fast_reg_page_list(pl[scnt]); - } - } -err1: - if (pl) - kfree(pl); - if (mr) - kfree(mr); - if (fr) - kfree(fr); - if (read) - kfree(read); - if (sgl) - kfree(sgl); - if (buf) - kfree(buf); - if (dma_addr) - kfree(dma_addr); -} -static void krping_fr_test_server(struct krping_cb *cb) -{ - DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); - wait_event_interruptible(cb->sem, cb->state == ERROR); -} - -static void krping_fr_test5_server(struct krping_cb *cb) -{ - struct ib_send_wr *bad_wr; - struct ib_wc wc; - int ret; - - /* Spin waiting for client's Start STAG/TO/Len */ - while (cb->state < RDMA_READ_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, - cb->remote_rkey, (uintmax_t)cb->remote_addr); - - /* Send STAG/TO/Len to client */ - krping_format_send(cb, cb->start_dma_addr); - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } - - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, 
"send completiong error %d\n", wc.status); - return; - } - - if (cb->duplex) - krping_fr_test5(cb); - DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); - wait_event_interruptible(cb->sem, cb->state == ERROR); -} - -static void krping_fr_test5_client(struct krping_cb *cb) -{ - struct ib_send_wr *bad; - struct ib_wc wc; - int ret; - - cb->state = RDMA_READ_ADV; - - /* Send STAG/TO/Len to server */ - krping_format_send(cb, cb->start_dma_addr); - if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); - return; - } - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } - - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); - return; - } - - /* Spin waiting for server's Start STAG/TO/Len */ - while (cb->state < RDMA_WRITE_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, - (uintmax_t)cb->remote_addr); - - return krping_fr_test5(cb); -} - -/* - * sq-depth worth of write + fastreg + inv, reposting them as the invs - * complete. - * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. - * If a count is given, then the last IO will have a bogus lkey in the - * write work request. This reproduces a fw bug where the connection - * will get stuck if a fastreg is processed while the ulptx is failing - * the bad write. 
- */ -static void krping_fr_test6(struct krping_cb *cb) -{ - struct ib_fast_reg_page_list **pl; - struct ib_send_wr *fr, *write, *inv, *bad; - struct ib_wc wc; - struct ib_sge *sgl; - u8 key = 0; - struct ib_mr **mr; - u8 **buf; - dma_addr_t *dma_addr; - int i; - int ret; - int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - unsigned long start; - int count = 0; - int scnt; - int depth = cb->txdepth / 3; - - if (!depth) { - PRINTF(cb, "txdepth must be > 3 for this test!\n"); - return; - } - - pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); - - mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); - - fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); - - sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); - - write = kzalloc(sizeof *write * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth); - - inv = kzalloc(sizeof *inv * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth); - - buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); - - dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); - - if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) { - PRINTF(cb, "kzalloc failed\n"); - goto err1; - } - - for (scnt = 0; scnt < depth; scnt++) { - pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl[scnt])) { - PRINTF(cb, "alloc_fr_page_list failed %ld\n", - PTR_ERR(pl[scnt])); - goto err2; - } - DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); - - mr[scnt] = 
ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr[scnt])) { - PRINTF(cb, "alloc_fr failed %ld\n", - PTR_ERR(mr[scnt])); - goto err2; - } - DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); - ib_update_fast_reg_key(mr[scnt], ++key); - - buf[scnt] = kmalloc(cb->size, GFP_KERNEL); - if (!buf[scnt]) { - PRINTF(cb, "kmalloc failed\n"); - ret = -ENOMEM; - goto err2; - } - DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); - dma_addr[scnt] = ib_dma_map_single(cb->pd->device, - buf[scnt], cb->size, - DMA_BIDIRECTIONAL); - if (dma_mapping_error(cb->pd->device->dma_device, - dma_addr[scnt])) { - PRINTF(cb, "dma_map failed\n"); - ret = -ENOMEM; - goto err2; - } - DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); - for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); - DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", - __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); - } - - write[scnt].opcode = IB_WR_RDMA_WRITE; - write[scnt].wr_id = scnt; - write[scnt].wr.rdma.rkey = cb->remote_rkey; - write[scnt].wr.rdma.remote_addr = cb->remote_addr; - write[scnt].num_sge = 1; - write[scnt].sg_list = &cb->rdma_sgl; - write[scnt].sg_list->length = cb->size; - write[scnt].next = &fr[scnt]; - - fr[scnt].opcode = IB_WR_FAST_REG_MR; - fr[scnt].wr_id = scnt; - fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; - fr[scnt].wr.fast_reg.length = cb->size; - fr[scnt].wr.fast_reg.page_list = pl[scnt]; - fr[scnt].wr.fast_reg.page_list_len = plen; - fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; - fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; - fr[scnt].next = &inv[scnt]; - - inv[scnt].opcode = IB_WR_LOCAL_INV; - inv[scnt].send_flags = IB_SEND_SIGNALED; - inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey; - - ret = ib_post_send(cb->qp, &write[scnt], &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err2; - } - } 
- - start = time_uptime; - DEBUG_LOG(cb, "%s starting IO.\n", __func__); - while (!cb->count || cb->server || count < cb->count) { - if ((time_uptime - start) >= 9) { - DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, - count); - wait_event_interruptible_timeout(cb->sem, - cb->state == ERROR, - 1); - if (cb->state == ERROR) - break; - start = time_uptime; - } - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", - ret); - goto err2; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, - "completion error %u wr_id %ju " - "opcode %d\n", wc.status, - (uintmax_t)wc.wr_id, wc.opcode); - goto err2; - } - count++; - if (count == (cb->count -1)) - cb->rdma_sgl.lkey = 0x00dead; - if (count == cb->count) - break; - ib_update_fast_reg_key(mr[wc.wr_id], ++key); - fr[wc.wr_id].wr.fast_reg.rkey = - mr[wc.wr_id]->rkey; - inv[wc.wr_id].ex.invalidate_rkey = - mr[wc.wr_id]->rkey; - ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad); - if (ret) { - PRINTF(cb, - "ib_post_send failed %d\n", ret); - goto err2; - } - } else if (krping_sigpending()){ - PRINTF(cb, "signal!\n"); - goto err2; - } - } while (ret == 1); - } - DEBUG_LOG(cb, "%s done!\n", __func__); -err2: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, "completion error %u " - "opcode %u\n", wc.status, wc.opcode); - } - } - } while (ret == 1); - - DEBUG_LOG(cb, "destroying fr mrs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (mr[scnt]) { - ib_dereg_mr(mr[scnt]); - DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); - } - } - DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (buf[scnt]) { - dma_unmap_single(cb->pd->device->dma_device, - dma_addr[scnt], cb->size, 
- DMA_BIDIRECTIONAL); - kfree(buf[scnt]); - DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); - } - } - DEBUG_LOG(cb, "destroying fr page lists!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (pl[scnt]) { - DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); - ib_free_fast_reg_page_list(pl[scnt]); - } - } -err1: - if (pl) - kfree(pl); - if (mr) - kfree(mr); - if (fr) - kfree(fr); - if (write) - kfree(write); - if (inv) - kfree(inv); - if (sgl) - kfree(sgl); - if (buf) - kfree(buf); - if (dma_addr) - kfree(dma_addr); -} - -static void krping_fr_test6_server(struct krping_cb *cb) -{ - struct ib_send_wr *bad_wr; - struct ib_wc wc; - int ret; - - /* Spin waiting for client's Start STAG/TO/Len */ - while (cb->state < RDMA_READ_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, - cb->remote_rkey, (uintmax_t)cb->remote_addr); - - /* Send STAG/TO/Len to client */ - krping_format_send(cb, cb->start_dma_addr); - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } - - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); - return; - } - - if (cb->duplex) - krping_fr_test6(cb); - DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); - wait_event_interruptible(cb->sem, cb->state == ERROR); -} - -static void krping_fr_test6_client(struct krping_cb *cb) -{ - struct ib_send_wr *bad; - struct ib_wc wc; - int ret; - - cb->state = RDMA_READ_ADV; - - /* Send STAG/TO/Len to server */ - krping_format_send(cb, cb->start_dma_addr); - if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); - return; - } - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } 
- - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); - return; - } - - /* Spin waiting for server's Start STAG/TO/Len */ - while (cb->state < RDMA_WRITE_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, - (uintmax_t)cb->remote_addr); - - return krping_fr_test6(cb); -} - static void krping_run_server(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_server(cb); if (ret) return; ret = krping_setup_qp(cb, cb->child_cm_id); if (ret) { - PRINTF(cb, "setup_qp failed: %d\n", ret); + printk(KERN_ERR PFX "setup_qp failed: %d\n", ret); goto err0; } ret = krping_setup_buffers(cb); if (ret) { - PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); + printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { - PRINTF(cb, "ib_post_recv failed: %d\n", ret); + printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_accept(cb); if (ret) { - PRINTF(cb, "connect error %d\n", ret); + printk(KERN_ERR PFX "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_server(cb); else if (cb->rlat) krping_rlat_test_server(cb); else if (cb->bw) krping_bw_test_server(cb); - else if (cb->frtest) { - switch (cb->testnum) { - case 1: - case 2: - case 3: - case 4: - krping_fr_test_server(cb); - break; - case 5: - krping_fr_test5_server(cb); - break; - case 6: - krping_fr_test6_server(cb); - break; - default: - PRINTF(cb, "unknown fr test %d\n", cb->testnum); - goto err2; - break; - } - } else + else krping_test_server(cb); rdma_disconnect(cb->child_cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); err0: rdma_destroy_id(cb->child_cm_id); } static void krping_test_client(struct krping_cb 
*cb) { int ping, start, cc, i, ret; struct ib_send_wr *bad_wr; unsigned char c; start = 65; for (ping = 0; !cb->count || ping < cb->count; ping++) { cb->state = RDMA_READ_ADV; /* Put some ascii text in the buffer. */ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); for (i = cc, c = start; i < cb->size; i++) { cb->start_buf[i] = c; c++; if (c > 122) c = 65; } start++; if (start > 122) start = 65; cb->start_buf[cb->size - 1] = 0; krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); + printk(KERN_ERR PFX "krping_format_send failed\n"); break; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } /* Wait for server to ACK */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } krping_format_send(cb, cb->rdma_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } /* Wait for the server to say the RDMA Write is complete. 
*/ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } if (cb->validate) if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { - PRINTF(cb, "data mismatch!\n"); + printk(KERN_ERR PFX "data mismatch!\n"); break; } - if (cb->verbose) { - if (strlen(cb->rdma_buf) > 128) { - char msgbuf[128]; - - strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); - PRINTF(cb, "ping data stripped: %s\n", - msgbuf); - } else - PRINTF(cb, "ping data: %s\n", cb->rdma_buf); - } + if (cb->verbose) + printk(KERN_INFO PFX "ping data: %s\n", cb->rdma_buf); #ifdef SLOW_KRPING wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif } } static void krping_rlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); + printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); + printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } #if 0 { int i; struct timeval start, stop; time_t sec; suseconds_t usec; unsigned long long elapsed; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = 
cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = 0; - cb->rdma_sq_wr.num_sge = 0; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = 0; + cb->rdma_sq_wr.wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { - if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { - PRINTF(cb, "Couldn't post send\n"); + if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { + printk(KERN_ERR PFX "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { - PRINTF(cb, "Completion wth error at %s:\n", + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? "server" : "client"); - PRINTF(cb, "Failed status %d: wr_id %d\n", + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } microtime(&stop); if (stop.tv_usec < start.tv_usec) { stop.tv_usec += 1000000; stop.tv_sec -= 1; } sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; - PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed); + printk(KERN_ERR PFX "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif rlat_test(cb); } static void krping_wlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); + printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send 
completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); + printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } wlat_test(cb); } static void krping_bw_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); + printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); + printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } bw_test(cb); } - /* - * fastreg 2 valid different mrs and verify the completions. 
+ * Manual qp flush test */ -static void krping_fr_test1(struct krping_cb *cb) +static void flush_qp(struct krping_cb *cb) { - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, *bad; + struct ib_send_wr wr = { 0 }, *bad; + struct ib_recv_wr recv_wr = { 0 }, *recv_bad; struct ib_wc wc; - struct ib_mr *mr1, *mr2; - int i; int ret; - int size = cb->size; - int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - int count = 0; + int flushed = 0; + int ccnt = 0; - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); - return; - } + rdma_disconnect(cb->cm_id); + DEBUG_LOG("disconnected!\n"); - mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr1)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; - } - mr2 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr2)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err2; - } - - - for (i=0; ipage_list[i] = i * PAGE_SIZE; - - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr_id = 1; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.rkey = mr1->rkey; - DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); + wr.opcode = IB_WR_SEND; + wr.wr_id = 0xdeadbeefcafebabe; + ret = ib_post_send(cb->qp, &wr, &bad); if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - fr.wr.fast_reg.rkey = mr2->rkey; - DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); - 
if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - goto err3; - } - if (ret == 1) { - DEBUG_LOG(cb, "completion status %u wr %s\n", - wc.status, wc.wr_id == 1 ? "fr" : "inv"); - count++; - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err3; - } - - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - } while (count != 2); -err3: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 1) { - PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); - } - } while (ret == 1); - DEBUG_LOG(cb, "destroying fr mr2!\n"); - - ib_dereg_mr(mr2); -err2: - DEBUG_LOG(cb, "destroying fr mr1!\n"); - ib_dereg_mr(mr1); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); -} - -/* - * fastreg the same mr twice, 2nd one should produce error cqe. 
- */ -static void krping_fr_test2(struct krping_cb *cb) -{ - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, *bad; - struct ib_wc wc; - struct ib_mr *mr1; - int i; - int ret; - int size = cb->size; - int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - int count = 0; - - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); + printk(KERN_ERR PFX "%s post_send failed ret %d\n", __func__, ret); return; } - mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr1)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; - } - - for (i=0; ipage_list[i] = i * PAGE_SIZE; - - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr_id = 1; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.rkey = mr1->rkey; - DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); + recv_wr.wr_id = 0xcafebabedeadbeef; + ret = ib_post_recv(cb->qp, &recv_wr, &recv_bad); if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; + printk(KERN_ERR PFX "%s post_recv failed ret %d\n", __func__, ret); + return; } - DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); + /* poll until the flush WRs complete */ do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { - PRINTF(cb, 
"ib_poll_cq failed %d\n", ret); - goto err3; + printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret); + return; } - if (ret == 1) { - DEBUG_LOG(cb, "completion status %u wr %s\n", - wc.status, wc.wr_id == 1 ? "fr" : "inv"); - count++; - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err3; - } - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - } while (count != 2); -err3: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 1) { - PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); - } - } while (ret == 1); - DEBUG_LOG(cb, "destroying fr mr1!\n"); - ib_dereg_mr(mr1); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); + if (ret == 0) + continue; + ccnt++; + if (wc.wr_id == 0xdeadbeefcafebabe || + wc.wr_id == 0xcafebabedeadbeef) + flushed++; + } while (flushed != 2); + DEBUG_LOG("qp_flushed! ccnt %u\n", ccnt); } -/* - * fastreg pipelined in a loop as fast as we can until the user interrupts. - * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. 
- */ -static void krping_fr_test3(struct krping_cb *cb) +static void krping_fr_test(struct krping_cb *cb) { - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, inv, *bad; + struct ib_send_wr inv, *bad; + struct ib_reg_wr fr; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; - int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt = 0; + struct scatterlist sg = {0}; - - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); + mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, plen); + if (IS_ERR(mr)) { + printk(KERN_ERR PFX "ib_alloc_mr failed %ld\n", PTR_ERR(mr)); return; } - - mr = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; + + sg_dma_address(&sg) = 0xcafebabe0000UL; + sg_dma_len(&sg) = size; + ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE); + if (ret <= 0) { + printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret); + goto err2; } - for (i=0; ipage_list[i] = i * PAGE_SIZE; - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.next = &inv; + fr.wr.opcode = IB_WR_REG_MR; + fr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; + fr.mr = mr; + fr.wr.next = &inv; + memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; - DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); + DEBUG_LOG("fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = 
time_uptime; - while (1) { + while (!cb->count || count <= cb->count) { + if (SIGPENDING(curthread)) { + printk(KERN_ERR PFX "signal!\n"); + break; + } if ((time_uptime - start) >= 9) { - DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen); + DEBUG_LOG("fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); if (cb->state == ERROR) break; start = time_uptime; } while (scnt < (cb->txdepth>>1)) { ib_update_fast_reg_key(mr, ++key); - fr.wr.fast_reg.rkey = mr->rkey; + fr.key = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; + size = arc4random() % cb->size; if (size == 0) size = cb->size; - plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list_len = plen; - ret = ib_post_send(cb->qp, &fr, &bad); + sg_dma_len(&sg) = size; + ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE); + if (ret <= 0) { + printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret); + goto err2; + } + ret = ib_post_send(cb->qp, &fr.wr, &bad); if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); + printk(KERN_ERR PFX "ib_post_send failed %d\n", ret); goto err2; } - scnt+=2; + scnt++; } - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - goto err2; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, "completion error %u\n", wc.status); - goto err2; - } - count++; - scnt--; - } - else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err2; - } - } while (ret == 1); - } -err2: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; + printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret); + goto err2; } if (ret == 1) { if (wc.status) { - PRINTF(cb, 
"completion error %u opcode %u\n", wc.status, wc.opcode); + printk(KERN_ERR PFX "completion error %u\n", wc.status); + goto err2; } - } - } while (ret == 1); - DEBUG_LOG(cb, "fr_test: done!\n"); - ib_dereg_mr(mr); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); -} - -/* - * fastreg 1 and invalidate 1 mr and verify completion. - */ -static void krping_fr_test4(struct krping_cb *cb) -{ - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, inv, *bad; - struct ib_wc wc; - struct ib_mr *mr1; - int i; - int ret; - int size = cb->size; - int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - int count = 0; - - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); - return; - } - - mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr1)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; - } - - for (i=0; ipage_list[i] = i * PAGE_SIZE; - - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr_id = 1; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.rkey = mr1->rkey; - fr.next = &inv; - memset(&inv, 0, sizeof inv); - inv.opcode = IB_WR_LOCAL_INV; - inv.ex.invalidate_rkey = mr1->rkey; - - DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret 
< 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - goto err3; - } - if (ret == 1) { - DEBUG_LOG(cb, "completion status %u wr %s\n", - wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err3; + scnt--; } - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - } while (count != 1); -err3: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 1) { - PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); - } - } while (ret == 1); - DEBUG_LOG(cb, "destroying fr mr1!\n"); - ib_dereg_mr(mr1); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); -} - -static void krping_fr_test(struct krping_cb *cb) -{ - switch (cb->testnum) { - case 1: - krping_fr_test1(cb); - break; - case 2: - krping_fr_test2(cb); - break; - case 3: - krping_fr_test3(cb); - break; - case 4: - krping_fr_test4(cb); - break; - case 5: - krping_fr_test5_client(cb); - break; - case 6: - krping_fr_test6_client(cb); - break; - default: - PRINTF(cb, "Unkown frtest num %u\n", cb->testnum); - break; } +err2: + flush_qp(cb); + DEBUG_LOG("fr_test: done!\n"); + ib_dereg_mr(mr); } static int krping_connect_client(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { - PRINTF(cb, "rdma_connect error %d\n", ret); + printk(KERN_ERR PFX "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { - PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); + 
printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state); return -1; } - DEBUG_LOG(cb, "rdma_connect successful\n"); + DEBUG_LOG("rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { - struct sockaddr_in sin; + struct sockaddr_storage sin; int ret; - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof sin; - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = cb->addr.s_addr; - sin.sin_port = cb->port; + fill_sockaddr(&sin, cb); - ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, - 2000); + ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *)&sin, 2000); if (ret) { - PRINTF(cb, "rdma_resolve_addr error %d\n", ret); + printk(KERN_ERR PFX "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { - PRINTF(cb, + printk(KERN_ERR PFX "addr/route resolution did not resolve: state %d\n", cb->state); return -EINTR; } - if (cb->mem == FASTREG && !fastreg_supported(cb, 0)) + if (!reg_supported(cb->cm_id->device)) return -EINVAL; - DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n"); + DEBUG_LOG("rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } static void krping_run_client(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_client(cb); if (ret) return; ret = krping_setup_qp(cb, cb->cm_id); if (ret) { - PRINTF(cb, "setup_qp failed: %d\n", ret); + printk(KERN_ERR PFX "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { - PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); + printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { - PRINTF(cb, "ib_post_recv failed: %d\n", ret); + printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { - PRINTF(cb, "connect error %d\n", ret); + 
printk(KERN_ERR PFX "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_client(cb); else if (cb->rlat) krping_rlat_test_client(cb); else if (cb->bw) krping_bw_test_client(cb); else if (cb->frtest) krping_fr_test(cb); else krping_test_client(cb); rdma_disconnect(cb->cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); } -int krping_doit(char *cmd, void *cookie) +int krping_doit(char *cmd) { struct krping_cb *cb; int op; int ret = 0; char *optarg; unsigned long optint; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return -ENOMEM; mutex_lock(&krping_mutex); list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); - cb->cookie = cookie; cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; - cb->mem = DMA; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { + struct in_addr in_addr; case 'a': cb->addr_str = optarg; - DEBUG_LOG(cb, "ipaddr (%s)\n", optarg); - if (!inet_aton(optarg, &cb->addr)) { - PRINTF(cb, "bad addr string %s\n", + cb->addr_type = AF_INET; + DEBUG_LOG("ipaddr (%s)\n", optarg); + if (!inet_aton(optarg, &in_addr)) { + printk(KERN_ERR PFX "bad addr string %s\n", optarg); ret = EINVAL; } + memcpy(cb->addr, &in_addr.s_addr, sizeof(in_addr.s_addr)); break; + case 'A': + cb->addr_str = optarg; + cb->addr_type = AF_INET6; + DEBUG_LOG("ipv6addr (%s)\n", optarg); + ret = EAFNOSUPPORT; /* XXX not supported */ + break; case 'p': cb->port = htons(optint); - DEBUG_LOG(cb, "port %d\n", (int)optint); + DEBUG_LOG("port %d\n", (int)optint); break; case 'P': cb->poll = 1; - DEBUG_LOG(cb, "server\n"); + DEBUG_LOG("server\n"); break; case 's': cb->server = 1; - DEBUG_LOG(cb, "server\n"); + DEBUG_LOG("server\n"); break; case 'c': cb->server = 0; - DEBUG_LOG(cb, "client\n"); + DEBUG_LOG("client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { - PRINTF(cb, "Invalid size %d " + 
printk(KERN_ERR PFX "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else - DEBUG_LOG(cb, "size %d\n", (int)optint); + DEBUG_LOG("size %d\n", (int)optint); break; case 'C': cb->count = optint; if (cb->count < 0) { - PRINTF(cb, "Invalid count %d\n", + printk(KERN_ERR PFX "Invalid count %d\n", cb->count); ret = EINVAL; } else - DEBUG_LOG(cb, "count %d\n", (int) cb->count); + DEBUG_LOG("count %d\n", (int) cb->count); break; case 'v': cb->verbose++; - DEBUG_LOG(cb, "verbose\n"); + DEBUG_LOG("verbose\n"); break; case 'V': cb->validate++; - DEBUG_LOG(cb, "validate data\n"); + DEBUG_LOG("validate data\n"); break; case 'l': cb->wlat++; break; case 'L': cb->rlat++; break; case 'B': cb->bw++; break; case 'd': cb->duplex++; break; - case 'm': - if (!strncmp(optarg, "dma", 3)) - cb->mem = DMA; - else if (!strncmp(optarg, "fastreg", 7)) - cb->mem = FASTREG; - else if (!strncmp(optarg, "mw", 2)) - cb->mem = MW; - else if (!strncmp(optarg, "mr", 2)) - cb->mem = MR; - else { - PRINTF(cb, "unknown mem mode %s. 
" - "Must be dma, fastreg, mw, or mr\n", - optarg); - ret = -EINVAL; - break; - } - break; case 'I': cb->server_invalidate = 1; break; case 'T': cb->txdepth = optint; - DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth); + DEBUG_LOG("txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; - DEBUG_LOG(cb, "using local dma lkey\n"); + DEBUG_LOG("using local dma lkey\n"); break; case 'R': cb->read_inv = 1; - DEBUG_LOG(cb, "using read-with-inv\n"); + DEBUG_LOG("using read-with-inv\n"); break; case 'f': cb->frtest = 1; - cb->testnum = optint; - DEBUG_LOG(cb, "fast-reg test!\n"); + DEBUG_LOG("fast-reg test!\n"); break; default: - PRINTF(cb, "unknown opt %s\n", optarg); + printk(KERN_ERR PFX "unknown opt %s\n", optarg); ret = -EINVAL; break; } } if (ret) goto out; if (cb->server == -1) { - PRINTF(cb, "must be either client or server\n"); + printk(KERN_ERR PFX "must be either client or server\n"); ret = -EINVAL; goto out; } - if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { - PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n"); + if (cb->server && cb->frtest) { + printk(KERN_ERR PFX "must be client to run frtest\n"); ret = -EINVAL; goto out; } - if (cb->server_invalidate && cb->mem != FASTREG) { - PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n"); - ret = -EINVAL; - goto out; - } - if (cb->read_inv && cb->mem != FASTREG) { - PRINTF(cb, "read_inv only valid with fastreg mem_mode\n"); + if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { + printk(KERN_ERR PFX "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } - if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) { - PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n"); + if (cb->wlat || cb->rlat || cb->bw) { + printk(KERN_ERR PFX "wlat, rlat, and bw tests only support mem_mode MR - which is no longer supported\n"); ret = -EINVAL; goto out; } - cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); + 
cb->cm_id = rdma_create_id(&init_net, krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); - PRINTF(cb, "rdma_create_id error %d\n", ret); + printk(KERN_ERR PFX "rdma_create_id error %d\n", ret); goto out; } - DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id); + DEBUG_LOG("created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); - DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id); + DEBUG_LOG("destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); list_del(&cb->list); mutex_unlock(&krping_mutex); kfree(cb); return ret; } void krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) (*f)(cb->pd ? &cb->stats : NULL, arg); mutex_unlock(&krping_mutex); -} - -void krping_init(void) -{ - - mutex_init(&krping_mutex); } Index: projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping.h =================================================================== --- projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping.h (revision 319974) @@ -1,20 +1,19 @@ /* * $FreeBSD$ */ struct krping_stats { unsigned long long send_bytes; unsigned long long send_msgs; unsigned long long recv_bytes; unsigned long long recv_msgs; unsigned long long write_bytes; unsigned long long write_msgs; unsigned long long read_bytes; unsigned long long read_msgs; char name[16]; }; -int krping_doit(char *, void *); +int krping_doit(char *); void krping_walk_cb_list(void (*)(struct krping_stats *, void *), void *); -void krping_init(void); int krping_sigpending(void); Index: projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping_dev.c =================================================================== --- projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping_dev.c (revision 319973) +++ 
projects/bsd_rdma_4_9/sys/contrib/rdma/krping/krping_dev.c (revision 319974) @@ -1,220 +1,219 @@ /* * This code lifted from: * Simple `echo' pseudo-device KLD * Murray Stokely * Converted to 5.X by Søren (Xride) Straarup */ /* * /bin/echo "server,port=9999,addr=192.168.69.142,validate" > /dev/krping * /bin/echo "client,port=9999,addr=192.168.69.142,validate" > /dev/krping */ #include __FBSDID("$FreeBSD$"); #include #include #include /* uprintf */ #include #include /* defines used in kernel.h */ #include /* types used in module initialization */ #include /* cdevsw struct */ #include /* uio struct */ #include #include #include #include #include "krping.h" #define BUFFERSIZE 512 SYSCTL_NODE(_dev, OID_AUTO, krping, CTLFLAG_RW, 0, "kernel rping module"); int krping_debug = 0; SYSCTL_INT(_dev_krping, OID_AUTO, debug, CTLFLAG_RW, &krping_debug, 0 , ""); /* Function prototypes */ static d_open_t krping_open; static d_close_t krping_close; static d_read_t krping_read; static d_write_t krping_write; /* Character device entry points */ static struct cdevsw krping_cdevsw = { .d_version = D_VERSION, .d_open = krping_open, .d_close = krping_close, .d_read = krping_read, .d_write = krping_write, .d_name = "krping", }; typedef struct s_krping { char msg[BUFFERSIZE]; int len; } krping_t; struct stats_list_entry { STAILQ_ENTRY(stats_list_entry) link; struct krping_stats *stats; }; STAILQ_HEAD(stats_list, stats_list_entry); /* vars */ static struct cdev *krping_dev; static int krping_loader(struct module *m, int what, void *arg) { int err = 0; switch (what) { case MOD_LOAD: /* kldload */ - krping_init(); krping_dev = make_dev(&krping_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "krping"); printf("Krping device loaded.\n"); break; case MOD_UNLOAD: destroy_dev(krping_dev); printf("Krping device unloaded.\n"); break; default: err = EOPNOTSUPP; break; } return (err); } static int krping_open(struct cdev *dev, int oflags, int devtype, struct thread *p) { return (0); } static int 
krping_close(struct cdev *dev, int fflag, int devtype, struct thread *p) { return 0; } static void krping_copy_stats(struct krping_stats *stats, void *arg) { struct stats_list_entry *s; struct stats_list *list = arg; s = malloc(sizeof(*s), M_DEVBUF, M_NOWAIT | M_ZERO); if (s == NULL) return; if (stats != NULL) { s->stats = malloc(sizeof(*stats), M_DEVBUF, M_NOWAIT | M_ZERO); if (s->stats == NULL) { free(s, M_DEVBUF); return; } *s->stats = *stats; } STAILQ_INSERT_TAIL(list, s, link); } static int krping_read(struct cdev *dev, struct uio *uio, int ioflag) { int num = 1; struct stats_list list; struct stats_list_entry *e; STAILQ_INIT(&list); krping_walk_cb_list(krping_copy_stats, &list); if (STAILQ_EMPTY(&list)) return (0); uprintf("krping: %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "num", "device", "snd bytes", "snd msgs", "rcv bytes", "rcv msgs", "wr bytes", "wr msgs", "rd bytes", "rd msgs"); while (!STAILQ_EMPTY(&list)) { e = STAILQ_FIRST(&list); STAILQ_REMOVE_HEAD(&list, link); if (e->stats == NULL) uprintf("krping: %d listen\n", num); else { struct krping_stats *stats = e->stats; uprintf("krping: %4d %10s %10llu %10llu %10llu %10llu " "%10llu %10llu %10llu %10llu\n", num, stats->name, stats->send_bytes, stats->send_msgs, stats->recv_bytes, stats->recv_msgs, stats->write_bytes, stats->write_msgs, stats->read_bytes, stats->read_msgs); free(stats, M_DEVBUF); } num++; free(e, M_DEVBUF); } return (0); } static int krping_write(struct cdev *dev, struct uio *uio, int ioflag) { int err = 0; int amt; int remain = BUFFERSIZE; char *cp; krping_t *krpingmsg; krpingmsg = malloc(sizeof *krpingmsg, M_DEVBUF, M_WAITOK|M_ZERO); if (!krpingmsg) { uprintf("Could not malloc mem!\n"); return ENOMEM; } cp = krpingmsg->msg; while (uio->uio_resid) { amt = MIN(uio->uio_resid, remain); if (amt == 0) break; /* Copy the string in from user memory to kernel memory */ err = uiomove(cp, amt, uio); if (err) { uprintf("Write failed: bad address!\n"); return err; } cp += amt; remain -= 
amt; } if (uio->uio_resid != 0) { uprintf("Message too big. max size is %d!\n", BUFFERSIZE); return EMSGSIZE; } /* null terminate and remove the \n */ cp--; *cp = 0; krpingmsg->len = (unsigned long)(cp - krpingmsg->msg); uprintf("krping: write string = |%s|\n", krpingmsg->msg); - err = krping_doit(krpingmsg->msg, curproc); + err = krping_doit(krpingmsg->msg); free(krpingmsg, M_DEVBUF); return(err); } int krping_sigpending(void) { return (SIGPENDING(curthread)); } DEV_MODULE(krping, krping_loader, NULL); MODULE_DEPEND(krping, ibcore, 1, 1, 1); Index: projects/bsd_rdma_4_9/sys/modules/ibcore/Makefile =================================================================== --- projects/bsd_rdma_4_9/sys/modules/ibcore/Makefile (revision 319973) +++ projects/bsd_rdma_4_9/sys/modules/ibcore/Makefile (revision 319974) @@ -1,22 +1,41 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/ofed/drivers/infiniband/core KMOD= ibcore -SRCS= addr.c iwcm.c sa_query.c ucma.c uverbs_cmd.c \ - agent.c multicast.c smi.c ud_header.c uverbs_main.c \ - mad.c peer_mem.c umem.c uverbs_marshall.c \ - cache.c device.c packer.c sysfs.c user_mad.c verbs.c \ - cm.c fmr_pool.c mad_rmpp.c ucm.c cma.c \ - vnode_if.h device_if.h bus_if.h pci_if.h \ - opt_inet.h opt_inet6.h +SRCS= vnode_if.h device_if.h bus_if.h pci_if.h \ + opt_inet.h opt_inet6.h \ + ib_addr.c \ + ib_agent.c \ + ib_cache.c \ + ib_cm.c \ + ib_cma.c \ + ib_cq.c \ + ib_device.c \ + ib_fmr_pool.c \ + ib_iwcm.c \ + ib_iwpm_msg.c \ + ib_iwpm_util.c \ + ib_mad.c \ + ib_mad_rmpp.c \ + ib_multicast.c \ + ib_packer.c \ + ib_roce_gid_mgmt.c \ + ib_sa_query.c \ + ib_smi.c \ + ib_sysfs.c \ + ib_ucm.c \ + ib_ucma.c \ + ib_ud_header.c \ + ib_umem.c \ + ib_user_mad.c \ + ib_uverbs_cmd.c \ + ib_uverbs_main.c \ + ib_uverbs_marshall.c \ + ib_verbs.c -CFLAGS+= -I${SRCTOP}/sys/ofed/drivers/infiniband/core CFLAGS+= -I${SRCTOP}/sys/ofed/include +CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include -CFLAGS+= -DINET6 -DINET +CFLAGS+= 
-DINET6 -DINET -DCONFIG_INFINIBAND_USER_MEM .include - -CFLAGS+= -Wno-cast-qual -Wno-pointer-arith - -CWARNFLAGS.cm.c= -Wno-unused-function Index: projects/bsd_rdma_4_9/sys/modules/ipoib/Makefile =================================================================== --- projects/bsd_rdma_4_9/sys/modules/ipoib/Makefile (revision 319973) +++ projects/bsd_rdma_4_9/sys/modules/ipoib/Makefile (revision 319974) @@ -1,16 +1,17 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/ofed/drivers/infiniband/ulp/ipoib KMOD= ipoib SRCS= device_if.h bus_if.h vnode_if.h pci_if.h \ opt_inet.h opt_inet6.h opt_ofed.h \ ipoib_cm.c ipoib_ib.c ipoib_main.c \ ipoib_multicast.c ipoib_verbs.c CFLAGS+= -I${SRCTOP}/sys/ofed/drivers/infiniband/ulp/ipoib CFLAGS+= -I${SRCTOP}/sys/ofed/include +CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include .include CFLAGS+= -Wno-cast-qual -Wno-pointer-arith Index: projects/bsd_rdma_4_9/sys/modules/rdma/krping/Makefile =================================================================== --- projects/bsd_rdma_4_9/sys/modules/rdma/krping/Makefile (revision 319973) +++ projects/bsd_rdma_4_9/sys/modules/rdma/krping/Makefile (revision 319974) @@ -1,11 +1,12 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/contrib/rdma/krping KMOD= krping SRCS= krping.c krping_dev.c getopt.c SRCS+= bus_if.h device_if.h pci_if.h pcib_if.h vnode_if.h SRCS+= opt_sched.h opt_inet.h opt_inet6.h CFLAGS+= -I${SRCTOP}/sys/ofed/include +CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include .include Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/peer_mem.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/peer_mem.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/peer_mem.c (nonexistent) @@ -1,461 +0,0 @@ -/* - * Copyright (c) 2013, Mellanox Technologies. All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include - -static DEFINE_MUTEX(peer_memory_mutex); -static LIST_HEAD(peer_memory_list); - -static int num_registered_peers; - -/* This code uses the sysfs which is not supporeted by the FreeBSD. 
- * * Will be added in future to the sysctl */ - -#if 0 -static struct kobject *peers_kobj; -static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj); -static ssize_t version_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%s\n", ib_peer_client->peer_mem->version); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_alloc_mrs_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_alloc_mrs); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_reg_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_reg_pages); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_dereg_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_dereg_pages); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_free_callbacks_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_free_callbacks); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static struct kobj_attribute version_attr = __ATTR_RO(version); -static struct kobj_attribute num_alloc_mrs = 
__ATTR_RO(num_alloc_mrs); -static struct kobj_attribute num_reg_pages = __ATTR_RO(num_reg_pages); -static struct kobj_attribute num_dereg_pages = __ATTR_RO(num_dereg_pages); -static struct kobj_attribute num_free_callbacks = __ATTR_RO(num_free_callbacks); - -static struct attribute *peer_mem_attrs[] = { - &version_attr.attr, - &num_alloc_mrs.attr, - &num_reg_pages.attr, - &num_dereg_pages.attr, - &num_free_callbacks.attr, - NULL, -}; -#endif - -#if 0 -static void destroy_peer_sysfs(struct ib_peer_memory_client *ib_peer_client) -{ - kobject_put(ib_peer_client->kobj); - if (!num_registered_peers) - kobject_put(peers_kobj); - - return; -} - -/* This code uses the sysfs which is not supporeted by the FreeBSD. - * Will be added in future to the sysctl */ - -static int create_peer_sysfs(struct ib_peer_memory_client *ib_peer_client) -{ - int ret; - - if (!num_registered_peers) { - /* creating under /sys/kernel/mm */ - peers_kobj = kobject_create_and_add("memory_peers", mm_kobj); - if (!peers_kobj) - return -ENOMEM; - } - - ib_peer_client->peer_mem_attr_group.attrs = peer_mem_attrs; - /* Dir alreday was created explicitly to get its kernel object for further usage */ - ib_peer_client->peer_mem_attr_group.name = NULL; - ib_peer_client->kobj = kobject_create_and_add(ib_peer_client->peer_mem->name, - peers_kobj); - - if (!ib_peer_client->kobj) { - ret = -EINVAL; - goto free; - } - - /* Create the files associated with this kobject */ - ret = sysfs_create_group(ib_peer_client->kobj, - &ib_peer_client->peer_mem_attr_group); - if (ret) - goto peer_free; - - return 0; - -peer_free: - kobject_put(ib_peer_client->kobj); - -free: - if (!num_registered_peers) - kobject_put(peers_kobj); - - return ret; -} -#endif - -static int ib_invalidate_peer_memory(void *reg_handle, - void *core_context) -{ - struct ib_peer_memory_client *ib_peer_client = - (struct ib_peer_memory_client *)reg_handle; - struct invalidation_ctx *invalidation_ctx; - struct core_ticket *core_ticket; - int need_unlock 
= 1; - - mutex_lock(&ib_peer_client->lock); - ib_peer_client->stats.num_free_callbacks += 1; - core_ticket = ib_peer_search_context(ib_peer_client, - (unsigned long)core_context); - if (!core_ticket) - goto out; - - invalidation_ctx = (struct invalidation_ctx *)core_ticket->context; - /* If context not ready yet mark to be invalidated */ - if (!invalidation_ctx->func) { - invalidation_ctx->peer_invalidated = 1; - goto out; - } - - invalidation_ctx->func(invalidation_ctx->cookie, - invalidation_ctx->umem, 0, 0); - if (invalidation_ctx->inflight_invalidation) { - - /* init the completion to wait on before letting other thread to run */ - init_completion(&invalidation_ctx->comp); - mutex_unlock(&ib_peer_client->lock); - need_unlock = 0; - wait_for_completion(&invalidation_ctx->comp); - } - - kfree(invalidation_ctx); - -out: - if (need_unlock) - mutex_unlock(&ib_peer_client->lock); - - return 0; -} - -/* access to that peer client is under its lock - no extra lock is needed */ -unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client, - void *context) -{ - struct core_ticket *core_ticket = kzalloc(sizeof(*core_ticket), GFP_KERNEL); - - ib_peer_client->last_ticket++; - core_ticket->context = context; - core_ticket->key = ib_peer_client->last_ticket; - - list_add_tail(&core_ticket->ticket_list, - &ib_peer_client->core_ticket_list); - - return core_ticket->key; -} - -int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key) -{ - struct core_ticket *core_ticket, *tmp; - - list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list, - ticket_list) { - if (core_ticket->key == key) { - list_del(&core_ticket->ticket_list); - kfree(core_ticket); - return 0; - } - } - - return 1; -} - -struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key) -{ - struct core_ticket *core_ticket, *tmp; - list_for_each_entry_safe(core_ticket, tmp, 
&ib_peer_client->core_ticket_list, - ticket_list) { - if (core_ticket->key == key) - return core_ticket; - } - - return NULL; -} - - -static int ib_memory_peer_check_mandatory(struct peer_memory_client - *peer_client) -{ -#define PEER_MEM_MANDATORY_FUNC(x) {\ - offsetof(struct peer_memory_client, x), #x } - - static const struct { - size_t offset; - char *name; - } mandatory_table[] = { - PEER_MEM_MANDATORY_FUNC(acquire), - PEER_MEM_MANDATORY_FUNC(get_pages), - PEER_MEM_MANDATORY_FUNC(put_pages), - PEER_MEM_MANDATORY_FUNC(get_page_size), - PEER_MEM_MANDATORY_FUNC(dma_map), - PEER_MEM_MANDATORY_FUNC(dma_unmap) - }; - int i; - - for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { - if (!*(void **) ((void *) peer_client + mandatory_table[i].offset)) { - printk(KERN_WARNING "Peer memory %s is missing mandatory function %s\n", - peer_client->name, mandatory_table[i].name); - return -EINVAL; - } - } - - return 0; -} - - - -void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, - invalidate_peer_memory *invalidate_callback) -{ - int ret = 0; - struct ib_peer_memory_client *ib_peer_client = NULL; - - mutex_lock(&peer_memory_mutex); - if (ib_memory_peer_check_mandatory(peer_client)) { - ret = -EINVAL; - goto out; - } - - ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL); - if (!ib_peer_client) - goto out; - ib_peer_client->peer_mem = peer_client; - - INIT_LIST_HEAD(&ib_peer_client->core_ticket_list); - mutex_init(&ib_peer_client->lock); -#ifdef __FreeBSD__ - ib_peer_client->holdcount = 0; - ib_peer_client->needwakeup = 0; - cv_init(&ib_peer_client->peer_cv, "ibprcl"); -#else - ret = init_srcu_struct(&ib_peer_client->peer_srcu); - if (ret) - goto free; -#endif -#if 0 - if (create_peer_sysfs(ib_peer_client)) - goto free; -#endif - *invalidate_callback = ib_invalidate_peer_memory; - list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list); - num_registered_peers++; - goto out; -#if 0 -free: - kfree(ib_peer_client); - ib_peer_client = 
NULL; -#endif -out: - mutex_unlock(&peer_memory_mutex); - return ib_peer_client; -} -EXPORT_SYMBOL(ib_register_peer_memory_client); - -void ib_unregister_peer_memory_client(void *reg_handle) -{ - struct ib_peer_memory_client *ib_peer_client = - (struct ib_peer_memory_client *)reg_handle; - - mutex_lock(&peer_memory_mutex); - /* remove from list to prevent future core clients usage as it goes down */ - list_del(&ib_peer_client->core_peer_list); -#ifdef __FreeBSD__ - while (ib_peer_client->holdcount != 0) { - ib_peer_client->needwakeup = 1; - cv_wait(&ib_peer_client->peer_cv, &peer_memory_mutex.sx); - } - cv_destroy(&ib_peer_client->peer_cv); -#else - mutex_unlock(&peer_memory_mutex); - /* peer memory can't go down while there are active clients */ - synchronize_srcu(&ib_peer_client->peer_srcu); - cleanup_srcu_struct(&ib_peer_client->peer_srcu); - mutex_lock(&peer_memory_mutex); -#endif - num_registered_peers--; -/* This code uses the sysfs which is not supporeted by the FreeBSD. - * Will be added in future to the sysctl */ -#if 0 - destroy_peer_sysfs(ib_peer_client); -#endif - mutex_unlock(&peer_memory_mutex); - - kfree(ib_peer_client); -} -EXPORT_SYMBOL(ib_unregister_peer_memory_client); - -/* This code uses the sysfs which is not supporeted by the FreeBSD. 
- * Will be added in future to the sysctl */ - -#if 0 -static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj) -{ - struct ib_peer_memory_client *ib_peer_client; - - mutex_lock(&peer_memory_mutex); - list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) { - if (ib_peer_client->kobj == kobj) - goto found; - } - - ib_peer_client = NULL; - -found: - - mutex_unlock(&peer_memory_mutex); - return ib_peer_client; -} -#endif - -struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr, - size_t size, void **peer_client_context, - int *srcu_key) -{ - struct ib_peer_memory_client *ib_peer_client; - int ret; - - mutex_lock(&peer_memory_mutex); - list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) { - ret = ib_peer_client->peer_mem->acquire(addr, size, - context->peer_mem_private_data, - context->peer_mem_name, - peer_client_context); - if (ret == 1) - goto found; - } - - ib_peer_client = NULL; - -found: - if (ib_peer_client) { -#ifdef __FreeBSD__ - ib_peer_client->holdcount++; -#else - *srcu_key = srcu_read_lock(&ib_peer_client->peer_srcu); -#endif - } - - mutex_unlock(&peer_memory_mutex); - return ib_peer_client; - -} -EXPORT_SYMBOL(ib_get_peer_client); - -void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, - void *peer_client_context, - int srcu_key) -{ - - if (ib_peer_client->peer_mem->release) - ib_peer_client->peer_mem->release(peer_client_context); - -#ifdef __FreeBSD__ - ib_peer_client->holdcount--; - if (ib_peer_client->holdcount == 0 && ib_peer_client->needwakeup) { - cv_signal(&ib_peer_client->peer_cv); - } -#else - srcu_read_unlock(&ib_peer_client->peer_srcu, srcu_key); -#endif - return; -} -EXPORT_SYMBOL(ib_put_peer_client); - Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/peer_mem.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Deleted: 
svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cache.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cache.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cache.c (nonexistent) @@ -1,467 +0,0 @@ -/* - * Copyright (c) 2004 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2005 Voltaire, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include - -#include - -#include "core_priv.h" - -struct ib_pkey_cache { - int table_len; - u16 table[0]; -}; - -struct ib_gid_cache { - int table_len; - union ib_gid table[0]; -}; - -struct ib_update_work { - struct work_struct work; - struct ib_device *device; - u8 port_num; -}; - -static inline int start_port(struct ib_device *device) -{ - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; -} - -static inline int end_port(struct ib_device *device) -{ - return (device->node_type == RDMA_NODE_IB_SWITCH) ? - 0 : device->phys_port_cnt; -} - -int ib_get_cached_gid(struct ib_device *device, - u8 port_num, - int index, - union ib_gid *gid) -{ - struct ib_gid_cache *cache; - unsigned long flags; - int ret = -EINVAL; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - read_lock_irqsave(&device->cache.lock, flags); - - if (device->cache.gid_cache) { - cache = device->cache.gid_cache[port_num - start_port(device)]; - - if (cache && index >= 0 && index < cache->table_len) { - *gid = cache->table[index]; - ret = 0; - } - } - - read_unlock_irqrestore(&device->cache.lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_cached_gid); - -int ib_find_cached_gid(struct ib_device *device, - union ib_gid *gid, - u8 *port_num, - u16 *index) -{ - struct ib_gid_cache *cache; - unsigned long flags; - int p, i; - int ret = -ENOENT; - - *port_num = -1; - if (index) - *index = -1; - - read_lock_irqsave(&device->cache.lock, flags); - if (!device->cache.gid_cache) - goto out; - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - cache = device->cache.gid_cache[p]; - if (!cache) - continue; - for (i = 0; i < cache->table_len; 
++i) { - if (!memcmp(gid, &cache->table[i], sizeof *gid)) { - *port_num = p + start_port(device); - if (index) - *index = i; - ret = 0; - goto out; - } - } - } -out: - read_unlock_irqrestore(&device->cache.lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_find_cached_gid); - -int ib_get_cached_pkey(struct ib_device *device, - u8 port_num, - int index, - u16 *pkey) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int ret = -EINVAL; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - read_lock_irqsave(&device->cache.lock, flags); - - if (device->cache.pkey_cache) { - cache = device->cache.pkey_cache[port_num - start_port(device)]; - - if (cache && index >= 0 && index < cache->table_len) { - *pkey = cache->table[index]; - ret = 0; - } - } - - read_unlock_irqrestore(&device->cache.lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_cached_pkey); - -int ib_find_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int i; - int ret = -ENOENT; - int partial_ix = -1; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - *index = -1; - - read_lock_irqsave(&device->cache.lock, flags); - - if (!device->cache.pkey_cache) - goto out; - - cache = device->cache.pkey_cache[port_num - start_port(device)]; - if (!cache) - goto out; - - for (i = 0; i < cache->table_len; ++i) - if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { - if (cache->table[i] & 0x8000) { - *index = i; - ret = 0; - break; - } else - partial_ix = i; - } - - if (ret && partial_ix >= 0) { - *index = partial_ix; - ret = 0; - } -out: - read_unlock_irqrestore(&device->cache.lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_find_cached_pkey); - -int ib_find_exact_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int i; - int ret = -ENOENT; - - if (port_num < 
start_port(device) || port_num > end_port(device)) - return -EINVAL; - - *index = -1; - - read_lock_irqsave(&device->cache.lock, flags); - - if (!device->cache.pkey_cache) - goto out; - - cache = device->cache.pkey_cache[port_num - start_port(device)]; - if (!cache) - goto out; - - for (i = 0; i < cache->table_len; ++i) - if (cache->table[i] == pkey) { - *index = i; - ret = 0; - break; - } -out: - read_unlock_irqrestore(&device->cache.lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_find_exact_cached_pkey); - -int ib_get_cached_lmc(struct ib_device *device, - u8 port_num, - u8 *lmc) -{ - unsigned long flags; - int ret = -EINVAL; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - read_lock_irqsave(&device->cache.lock, flags); - if (device->cache.lmc_cache) { - *lmc = device->cache.lmc_cache[port_num - start_port(device)]; - ret = 0; - } - read_unlock_irqrestore(&device->cache.lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_cached_lmc); - -static void ib_cache_update(struct ib_device *device, - u8 port) -{ - struct ib_port_attr *tprops = NULL; - struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; - struct ib_gid_cache *gid_cache = NULL, *old_gid_cache; - int i; - int ret; - - if (!(device->cache.pkey_cache && device->cache.gid_cache && - device->cache.lmc_cache)) - return; - - tprops = kmalloc(sizeof *tprops, GFP_KERNEL); - if (!tprops) - return; - - ret = ib_query_port(device, port, tprops); - if (ret) { - printk(KERN_WARNING "ib_query_port failed (%d) for %s\n", - ret, device->name); - goto err; - } - - pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * - sizeof *pkey_cache->table, GFP_KERNEL); - if (!pkey_cache) - goto err; - - pkey_cache->table_len = tprops->pkey_tbl_len; - - gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len * - sizeof *gid_cache->table, GFP_KERNEL); - if (!gid_cache) - goto err; - - gid_cache->table_len = tprops->gid_tbl_len; - - for (i = 0; i < pkey_cache->table_len; 
++i) { - ret = ib_query_pkey(device, port, i, pkey_cache->table + i); - if (ret) { - printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n", - ret, device->name, i); - goto err; - } - } - - for (i = 0; i < gid_cache->table_len; ++i) { - ret = ib_query_gid(device, port, i, gid_cache->table + i); - if (ret) { - printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); - goto err; - } - } - - write_lock_irq(&device->cache.lock); - - old_pkey_cache = device->cache.pkey_cache[port - start_port(device)]; - old_gid_cache = device->cache.gid_cache [port - start_port(device)]; - - device->cache.pkey_cache[port - start_port(device)] = pkey_cache; - device->cache.gid_cache [port - start_port(device)] = gid_cache; - - device->cache.lmc_cache[port - start_port(device)] = tprops->lmc; - - write_unlock_irq(&device->cache.lock); - - kfree(old_pkey_cache); - kfree(old_gid_cache); - kfree(tprops); - return; - -err: - kfree(pkey_cache); - kfree(gid_cache); - kfree(tprops); -} - -static void ib_cache_task(struct work_struct *_work) -{ - struct ib_update_work *work = - container_of(_work, struct ib_update_work, work); - - ib_cache_update(work->device, work->port_num); - kfree(work); -} - -static void ib_cache_event(struct ib_event_handler *handler, - struct ib_event *event) -{ - struct ib_update_work *work; - - if (event->event == IB_EVENT_PORT_ERR || - event->event == IB_EVENT_PORT_ACTIVE || - event->event == IB_EVENT_LID_CHANGE || - event->event == IB_EVENT_PKEY_CHANGE || - event->event == IB_EVENT_SM_CHANGE || - event->event == IB_EVENT_CLIENT_REREGISTER || - event->event == IB_EVENT_GID_CHANGE) { - work = kmalloc(sizeof *work, GFP_ATOMIC); - if (work) { - INIT_WORK(&work->work, ib_cache_task); - work->device = event->device; - work->port_num = event->element.port_num; - queue_work(ib_wq, &work->work); - } - } -} - -static void ib_cache_setup_one(struct ib_device *device) -{ - int p; - - rwlock_init(&device->cache.lock); - - 
device->cache.pkey_cache = - kmalloc(sizeof *device->cache.pkey_cache * - (end_port(device) - start_port(device) + 1), GFP_KERNEL); - device->cache.gid_cache = - kmalloc(sizeof *device->cache.gid_cache * - (end_port(device) - start_port(device) + 1), GFP_KERNEL); - - device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * - (end_port(device) - - start_port(device) + 1), - GFP_KERNEL); - - if (!device->cache.pkey_cache || !device->cache.gid_cache || - !device->cache.lmc_cache) { - printk(KERN_WARNING "Couldn't allocate cache " - "for %s\n", device->name); - goto err; - } - - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - device->cache.pkey_cache[p] = NULL; - device->cache.gid_cache [p] = NULL; - ib_cache_update(device, p + start_port(device)); - } - - INIT_IB_EVENT_HANDLER(&device->cache.event_handler, - device, ib_cache_event); - if (ib_register_event_handler(&device->cache.event_handler)) - goto err_cache; - - return; - -err_cache: - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - kfree(device->cache.pkey_cache[p]); - kfree(device->cache.gid_cache[p]); - } - -err: - kfree(device->cache.pkey_cache); - kfree(device->cache.gid_cache); - kfree(device->cache.lmc_cache); - device->cache.pkey_cache = NULL; - device->cache.gid_cache = NULL; - device->cache.lmc_cache = NULL; -} - -static void ib_cache_cleanup_one(struct ib_device *device) -{ - int p; - - if (!(device->cache.pkey_cache && device->cache.gid_cache && - device->cache.lmc_cache)) - return; - - ib_unregister_event_handler(&device->cache.event_handler); - flush_workqueue(ib_wq); - - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - kfree(device->cache.pkey_cache[p]); - kfree(device->cache.gid_cache[p]); - } - - kfree(device->cache.pkey_cache); - kfree(device->cache.gid_cache); - kfree(device->cache.lmc_cache); -} - -static struct ib_client cache_client = { - .name = "cache", - .add = ib_cache_setup_one, - .remove = ib_cache_cleanup_one -}; - -int 
__init ib_cache_setup(void) -{ - return ib_register_client(&cache_client); -} - -void __exit ib_cache_cleanup(void) -{ - ib_unregister_client(&cache_client); -} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cache.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/verbs.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/verbs.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/verbs.c (nonexistent) @@ -1,1538 +0,0 @@ -/* - * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2004 Infinicon Corporation. All rights reserved. - * Copyright (c) 2004 Intel Corporation. All rights reserved. - * Copyright (c) 2004 Topspin Corporation. All rights reserved. - * Copyright (c) 2004 Voltaire Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -int ib_rate_to_mult(enum ib_rate rate) -{ - switch (rate) { - case IB_RATE_2_5_GBPS: return 1; - case IB_RATE_5_GBPS: return 2; - case IB_RATE_10_GBPS: return 4; - case IB_RATE_20_GBPS: return 8; - case IB_RATE_30_GBPS: return 12; - case IB_RATE_40_GBPS: return 16; - case IB_RATE_60_GBPS: return 24; - case IB_RATE_80_GBPS: return 32; - case IB_RATE_120_GBPS: return 48; - default: return -1; - } -} -EXPORT_SYMBOL(ib_rate_to_mult); - -enum ib_rate mult_to_ib_rate(int mult) -{ - switch (mult) { - case 1: return IB_RATE_2_5_GBPS; - case 2: return IB_RATE_5_GBPS; - case 4: return IB_RATE_10_GBPS; - case 8: return IB_RATE_20_GBPS; - case 12: return IB_RATE_30_GBPS; - case 16: return IB_RATE_40_GBPS; - case 24: return IB_RATE_60_GBPS; - case 32: return IB_RATE_80_GBPS; - case 48: return IB_RATE_120_GBPS; - default: return IB_RATE_PORT_CURRENT; - } -} -EXPORT_SYMBOL(mult_to_ib_rate); - -int ib_rate_to_mbps(enum ib_rate rate) -{ - switch (rate) { - case IB_RATE_2_5_GBPS: return 2500; - case IB_RATE_5_GBPS: return 5000; - case IB_RATE_10_GBPS: return 10000; - case IB_RATE_20_GBPS: return 20000; - case IB_RATE_30_GBPS: return 30000; - case IB_RATE_40_GBPS: return 40000; - case 
IB_RATE_60_GBPS: return 60000; - case IB_RATE_80_GBPS: return 80000; - case IB_RATE_120_GBPS: return 120000; - case IB_RATE_14_GBPS: return 14062; - case IB_RATE_56_GBPS: return 56250; - case IB_RATE_112_GBPS: return 112500; - case IB_RATE_168_GBPS: return 168750; - case IB_RATE_25_GBPS: return 25781; - case IB_RATE_100_GBPS: return 103125; - case IB_RATE_200_GBPS: return 206250; - case IB_RATE_300_GBPS: return 309375; - default: return -1; - } -} -EXPORT_SYMBOL(ib_rate_to_mbps); - -enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) -{ - switch (node_type) { - case RDMA_NODE_IB_CA: - case RDMA_NODE_IB_SWITCH: - case RDMA_NODE_IB_ROUTER: - return RDMA_TRANSPORT_IB; - case RDMA_NODE_RNIC: - return RDMA_TRANSPORT_IWARP; - case RDMA_NODE_MIC: - return RDMA_TRANSPORT_SCIF; - default: - BUG(); - return 0; - } -} -EXPORT_SYMBOL(rdma_node_get_transport); - -enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) -{ - if (device->get_link_layer) - return device->get_link_layer(device, port_num); - - switch (rdma_node_get_transport(device->node_type)) { - case RDMA_TRANSPORT_IB: - return IB_LINK_LAYER_INFINIBAND; - case RDMA_TRANSPORT_IWARP: - return IB_LINK_LAYER_ETHERNET; - case RDMA_TRANSPORT_SCIF: - return IB_LINK_LAYER_SCIF; - default: - return IB_LINK_LAYER_UNSPECIFIED; - } -} -EXPORT_SYMBOL(rdma_port_get_link_layer); - -/* Protection domains */ - -struct ib_pd *ib_alloc_pd(struct ib_device *device) -{ - struct ib_pd *pd; - - pd = device->alloc_pd(device, NULL, NULL); - - if (!IS_ERR(pd)) { - pd->device = device; - pd->uobject = NULL; - atomic_set(&pd->usecnt, 0); - } - - return pd; -} -EXPORT_SYMBOL(ib_alloc_pd); - -int ib_dealloc_pd(struct ib_pd *pd) -{ - if (atomic_read(&pd->usecnt)) - return -EBUSY; - - return pd->device->dealloc_pd(pd); -} -EXPORT_SYMBOL(ib_dealloc_pd); - -/* Address handles */ - -struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) -{ - struct ib_ah *ah; - - ah = 
pd->device->create_ah(pd, ah_attr); - - if (!IS_ERR(ah)) { - ah->device = pd->device; - ah->pd = pd; - ah->uobject = NULL; - atomic_inc(&pd->usecnt); - } - - return ah; -} -EXPORT_SYMBOL(ib_create_ah); - -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, - struct ib_grh *grh, struct ib_ah_attr *ah_attr) -{ - u32 flow_class; - u16 gid_index; - int ret; - int is_eth = (rdma_port_get_link_layer(device, port_num) == - IB_LINK_LAYER_ETHERNET); - - memset(ah_attr, 0, sizeof *ah_attr); - if (is_eth) { - if (!(wc->wc_flags & IB_WC_GRH)) - return -EPROTOTYPE; - - if (wc->wc_flags & IB_WC_WITH_SMAC && - wc->wc_flags & IB_WC_WITH_VLAN) { - memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); - ah_attr->vlan_id = wc->vlan_id; - } else { - u32 scope_id = rdma_get_ipv6_scope_id(device, port_num); - ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, - ah_attr->dmac, &ah_attr->vlan_id, - scope_id); - if (ret) - return ret; - } - } else { - ah_attr->vlan_id = 0xffff; - } - - - ah_attr->dlid = wc->slid; - ah_attr->sl = wc->sl; - ah_attr->src_path_bits = wc->dlid_path_bits; - ah_attr->port_num = port_num; - - if (wc->wc_flags & IB_WC_GRH) { - ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = grh->sgid; - - ret = ib_find_cached_gid(device, &grh->dgid, &port_num, - &gid_index); - if (ret) - return ret; - - ah_attr->grh.sgid_index = (u8) gid_index; - flow_class = be32_to_cpu(grh->version_tclass_flow); - ah_attr->grh.flow_label = flow_class & 0xFFFFF; - ah_attr->grh.hop_limit = 0xFF; - ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; - } - return 0; -} -EXPORT_SYMBOL(ib_init_ah_from_wc); - -struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, - struct ib_grh *grh, u8 port_num) -{ - struct ib_ah_attr ah_attr; - int ret; - - ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); - if (ret) - return ERR_PTR(ret); - - return ib_create_ah(pd, &ah_attr); -} -EXPORT_SYMBOL(ib_create_ah_from_wc); - -int ib_modify_ah(struct ib_ah 
*ah, struct ib_ah_attr *ah_attr) -{ - return ah->device->modify_ah ? - ah->device->modify_ah(ah, ah_attr) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_modify_ah); - -int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) -{ - return ah->device->query_ah ? - ah->device->query_ah(ah, ah_attr) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_query_ah); - -int ib_destroy_ah(struct ib_ah *ah) -{ - struct ib_pd *pd; - int ret; - - pd = ah->pd; - ret = ah->device->destroy_ah(ah); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_destroy_ah); - -/* Shared receive queues */ - -struct ib_srq *ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *srq_init_attr) -{ - struct ib_srq *srq; - - if (!pd->device->create_srq) - return ERR_PTR(-ENOSYS); - - srq = pd->device->create_srq(pd, srq_init_attr, NULL); - - if (!IS_ERR(srq)) { - srq->device = pd->device; - srq->pd = pd; - srq->uobject = NULL; - srq->event_handler = srq_init_attr->event_handler; - srq->srq_context = srq_init_attr->srq_context; - srq->srq_type = srq_init_attr->srq_type; - if (srq->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; - atomic_inc(&srq->ext.xrc.xrcd->usecnt); - atomic_inc(&srq->ext.xrc.cq->usecnt); - } - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); - } - - return srq; -} -EXPORT_SYMBOL(ib_create_srq); - -int ib_modify_srq(struct ib_srq *srq, - struct ib_srq_attr *srq_attr, - enum ib_srq_attr_mask srq_attr_mask) -{ - return srq->device->modify_srq ? - srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_modify_srq); - -int ib_query_srq(struct ib_srq *srq, - struct ib_srq_attr *srq_attr) -{ - return srq->device->query_srq ? - srq->device->query_srq(srq, srq_attr) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_srq); - -int ib_query_values(struct ib_device *device, - int q_values, struct ib_device_values *values) -{ - return device->query_values ? 
- device->query_values(device, q_values, values) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_values); - -int ib_destroy_srq(struct ib_srq *srq) -{ - struct ib_pd *pd; - enum ib_srq_type srq_type; - struct ib_xrcd *uninitialized_var(xrcd); - struct ib_cq *uninitialized_var(cq); - int ret; - - if (atomic_read(&srq->usecnt)) - return -EBUSY; - - pd = srq->pd; - srq_type = srq->srq_type; - if (srq_type == IB_SRQT_XRC) { - xrcd = srq->ext.xrc.xrcd; - cq = srq->ext.xrc.cq; - } - - ret = srq->device->destroy_srq(srq); - if (!ret) { - atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) { - atomic_dec(&xrcd->usecnt); - atomic_dec(&cq->usecnt); - } - } - - return ret; -} -EXPORT_SYMBOL(ib_destroy_srq); - -/* Queue pairs */ - -static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) -{ - struct ib_qp *qp = context; - unsigned long flags; - - /* The code below must be synced with deletions of existing qps (ib_close_qp) -- - * because a qp from the list may be closed during the scan, resulting in a kernel Oops. 
- */ - spin_lock_irqsave(&qp->device->event_handler_lock, flags); - list_for_each_entry(event->element.qp, &qp->open_list, open_list) - if (event->element.qp->event_handler) - event->element.qp->event_handler(event, event->element.qp->qp_context); - spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); -} - -static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) -{ - mutex_lock(&xrcd->tgt_qp_mutex); - list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); - mutex_unlock(&xrcd->tgt_qp_mutex); -} - -static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, - void (*event_handler)(struct ib_event *, void *), - void *qp_context) -{ - struct ib_qp *qp; - unsigned long flags; - - qp = kzalloc(sizeof *qp, GFP_KERNEL); - if (!qp) - return ERR_PTR(-ENOMEM); - - qp->real_qp = real_qp; - atomic_inc(&real_qp->usecnt); - qp->device = real_qp->device; - qp->event_handler = event_handler; - qp->qp_context = qp_context; - qp->qp_num = real_qp->qp_num; - qp->qp_type = real_qp->qp_type; - - spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); - list_add(&qp->open_list, &real_qp->open_list); - spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); - - return qp; -} - -struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, - struct ib_qp_open_attr *qp_open_attr) -{ - struct ib_qp *qp, *real_qp; - - if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) - return ERR_PTR(-EINVAL); - - qp = ERR_PTR(-EINVAL); - mutex_lock(&xrcd->tgt_qp_mutex); - list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { - if (real_qp->qp_num == qp_open_attr->qp_num) { - qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, - qp_open_attr->qp_context); - break; - } - } - mutex_unlock(&xrcd->tgt_qp_mutex); - return qp; -} -EXPORT_SYMBOL(ib_open_qp); - -struct ib_qp *ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *qp_init_attr) -{ - struct ib_qp *qp, *real_qp; - struct ib_device *device; - - device = pd ? 
pd->device : qp_init_attr->xrcd->device; - qp = device->create_qp(pd, qp_init_attr, NULL); - - if (!IS_ERR(qp)) { - qp->device = device; - qp->real_qp = qp; - qp->uobject = NULL; - qp->qp_type = qp_init_attr->qp_type; - - atomic_set(&qp->usecnt, 0); - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { - qp->event_handler = __ib_shared_qp_event_handler; - qp->qp_context = qp; - qp->pd = NULL; - qp->send_cq = qp->recv_cq = NULL; - qp->srq = NULL; - qp->xrcd = qp_init_attr->xrcd; - atomic_inc(&qp_init_attr->xrcd->usecnt); - INIT_LIST_HEAD(&qp->open_list); - - real_qp = qp; - qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, - qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->destroy_qp(real_qp); - } else { - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { - qp->recv_cq = NULL; - qp->srq = NULL; - } else { - qp->recv_cq = qp_init_attr->recv_cq; - atomic_inc(&qp_init_attr->recv_cq->usecnt); - qp->srq = qp_init_attr->srq; - if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); - } - - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->xrcd = NULL; - - atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); - } - } - - return qp; -} -EXPORT_SYMBOL(ib_create_qp); - -static const struct { - int valid; - enum ib_qp_attr_mask req_param[IB_QPT_MAX]; - enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; - enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; - enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; -} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_INIT] = { - .valid = 1, - .req_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_QKEY), - [IB_QPT_RAW_PACKET] = IB_QP_PORT, - [IB_QPT_UC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_RC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - 
IB_QP_ACCESS_FLAGS), - [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS | - IB_QP_DC_KEY), - [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - }, - .opt_param = { - [IB_QPT_UD] = IB_QP_GROUP_RSS, - [IB_QPT_RAW_PACKET] = IB_QP_GROUP_RSS - } - }, - }, - [IB_QPS_INIT] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_INIT] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_RC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - } - }, - [IB_QPS_RTR] = { - .valid = 1, - .req_param = { - [IB_QPT_UC] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN), - [IB_QPT_RC] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_DC_INI] = (IB_QP_PATH_MTU | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_XRC_INI] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN), - [IB_QPT_XRC_TGT] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_MIN_RNR_TIMER), - }, - .req_param_add_eth = { - [IB_QPT_RC] = (IB_QP_SMAC), - [IB_QPT_UC] = (IB_QP_SMAC), - [IB_QPT_XRC_INI] = (IB_QP_SMAC), - [IB_QPT_XRC_TGT] = (IB_QP_SMAC) - }, - .opt_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - 
IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_RC] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_DC_INI] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_RAW_PACKET] = IB_QP_AV, - }, - .opt_param_add_eth = { - [IB_QPT_RC] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_UC] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID) - } - } - }, - [IB_QPS_RTR] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_RTS] = { - .valid = 1, - .req_param = { - [IB_QPT_UD] = IB_QP_SQ_PSN, - [IB_QPT_UC] = IB_QP_SQ_PSN, - [IB_QPT_RC] = (IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_SQ_PSN | - IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_DC_INI] = (IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_SQ_PSN | - IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | - IB_QP_SQ_PSN), - [IB_QPT_SMI] = IB_QP_SQ_PSN, - [IB_QPT_GSI] = IB_QP_SQ_PSN, - }, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | - 
IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_SMI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - } - }, - [IB_QPS_RTS] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_RTS] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_SMI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - }, - [IB_QPS_SQD] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? 
*/ - [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY - } - }, - }, - [IB_QPS_SQD] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_RTS] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_SMI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - }, - [IB_QPS_SQD] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_AV | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_PORT | - IB_QP_AV | - IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_MAX_QP_RD_ATOMIC | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_INI] = (IB_QP_PORT | - IB_QP_AV | - IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_MAX_QP_RD_ATOMIC | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_PORT | - IB_QP_AV | - IB_QP_TIMEOUT | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - } - } - }, - [IB_QPS_SQE] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - 
[IB_QPS_RTS] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS), - [IB_QPT_SMI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - } - }, - [IB_QPS_ERR] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 } - } -}; - -int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) -{ - enum ib_qp_attr_mask req_param, opt_param; - - if (cur_state < 0 || cur_state > IB_QPS_ERR || - next_state < 0 || next_state > IB_QPS_ERR) - return 0; - - if (mask & IB_QP_CUR_STATE && - cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && - cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) - return 0; - - if (!qp_state_table[cur_state][next_state].valid) - return 0; - - req_param = qp_state_table[cur_state][next_state].req_param[type]; - opt_param = qp_state_table[cur_state][next_state].opt_param[type]; - - if (ll == IB_LINK_LAYER_ETHERNET) { - req_param |= qp_state_table[cur_state][next_state]. - req_param_add_eth[type]; - opt_param |= qp_state_table[cur_state][next_state]. - opt_param_add_eth[type]; - } - - if ((mask & req_param) != req_param) - return 0; - - if (mask & ~(req_param | opt_param | IB_QP_STATE)) - return 0; - - return 1; -} -EXPORT_SYMBOL(ib_modify_qp_is_ok); - -int ib_modify_qp(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, - int qp_attr_mask) -{ - int ret; - - ret = qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); - if (!ret && (qp_attr_mask & IB_QP_PORT)) - qp->port_num = qp_attr->port_num; - - return ret; -} -EXPORT_SYMBOL(ib_modify_qp); - -int ib_query_qp(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, - int qp_attr_mask, - struct ib_qp_init_attr *qp_init_attr) -{ - return qp->device->query_qp ? 
- qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_query_qp); - -int ib_close_qp(struct ib_qp *qp) -{ - struct ib_qp *real_qp; - unsigned long flags; - - real_qp = qp->real_qp; - if (real_qp == qp) - return -EINVAL; - - spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); - list_del(&qp->open_list); - spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); - - atomic_dec(&real_qp->usecnt); - kfree(qp); - - return 0; -} -EXPORT_SYMBOL(ib_close_qp); - -static int __ib_destroy_shared_qp(struct ib_qp *qp) -{ - struct ib_xrcd *xrcd; - struct ib_qp *real_qp; - int ret; - - real_qp = qp->real_qp; - xrcd = real_qp->xrcd; - - mutex_lock(&xrcd->tgt_qp_mutex); - ib_close_qp(qp); - if (atomic_read(&real_qp->usecnt) == 0) - list_del(&real_qp->xrcd_list); - else - real_qp = NULL; - mutex_unlock(&xrcd->tgt_qp_mutex); - - if (real_qp) { - ret = ib_destroy_qp(real_qp); - if (!ret) - atomic_dec(&xrcd->usecnt); - else - __ib_insert_xrcd_qp(xrcd, real_qp); - } - - return 0; -} - -int ib_destroy_qp(struct ib_qp *qp) -{ - struct ib_pd *pd; - struct ib_cq *scq, *rcq; - struct ib_srq *srq; - int ret; - - if (atomic_read(&qp->usecnt)) - return -EBUSY; - - if (qp->real_qp != qp) - return __ib_destroy_shared_qp(qp); - - pd = qp->pd; - scq = qp->send_cq; - rcq = qp->recv_cq; - srq = qp->srq; - - ret = qp->device->destroy_qp(qp); - if (!ret) { - if (pd) - atomic_dec(&pd->usecnt); - if (scq) - atomic_dec(&scq->usecnt); - if (rcq) - atomic_dec(&rcq->usecnt); - if (srq) - atomic_dec(&srq->usecnt); - } - - return ret; -} -EXPORT_SYMBOL(ib_destroy_qp); - -/* Completion queues */ - -struct ib_cq *ib_create_cq(struct ib_device *device, - ib_comp_handler comp_handler, - void (*event_handler)(struct ib_event *, void *), - void *cq_context, int cqe, int comp_vector) -{ - struct ib_cq *cq; - struct ib_cq_init_attr attr = { - .cqe = cqe, - .comp_vector = comp_vector, - .flags = 0, - }; - - cq = device->create_cq(device, &attr, 
NULL, NULL); - - if (!IS_ERR(cq)) { - cq->device = device; - cq->uobject = NULL; - cq->comp_handler = comp_handler; - cq->event_handler = event_handler; - cq->cq_context = cq_context; - atomic_set(&cq->usecnt, 0); - } - - return cq; -} -EXPORT_SYMBOL(ib_create_cq); - -int ib_modify_cq(struct ib_cq *cq, - struct ib_cq_attr *cq_attr, - int cq_attr_mask) -{ - return cq->device->modify_cq ? - cq->device->modify_cq(cq, cq_attr, cq_attr_mask) : -ENOSYS; -} -EXPORT_SYMBOL(ib_modify_cq); - -int ib_destroy_cq(struct ib_cq *cq) -{ - if (atomic_read(&cq->usecnt)) - return -EBUSY; - - return cq->device->destroy_cq(cq); -} -EXPORT_SYMBOL(ib_destroy_cq); - -int ib_resize_cq(struct ib_cq *cq, int cqe) -{ - return cq->device->resize_cq ? - cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS; -} -EXPORT_SYMBOL(ib_resize_cq); - -/* Memory regions */ - -struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) -{ - struct ib_mr *mr; - int err; - - err = ib_check_mr_access(mr_access_flags); - if (err) - return ERR_PTR(err); - - mr = pd->device->get_dma_mr(pd, mr_access_flags); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_get_dma_mr); - -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_mr *mr; - int err; - - err = ib_check_mr_access(mr_access_flags); - if (err) - return ERR_PTR(err); - - if (!pd->device->reg_phys_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_reg_phys_mr); - -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct 
ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_pd *old_pd; - int ret; - - ret = ib_check_mr_access(mr_access_flags); - if (ret) - return ret; - - if (!mr->device->rereg_phys_mr) - return -ENOSYS; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - old_pd = mr->pd; - - ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd, - phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) { - atomic_dec(&old_pd->usecnt); - atomic_inc(&pd->usecnt); - } - - return ret; -} -EXPORT_SYMBOL(ib_rereg_phys_mr); - -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) -{ - return mr->device->query_mr ? - mr->device->query_mr(mr, mr_attr) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_mr); - -int ib_dereg_mr(struct ib_mr *mr) -{ - struct ib_pd *pd; - int ret; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; - ret = mr->device->dereg_mr(mr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dereg_mr); - -struct ib_mr *ib_create_mr(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr) -{ - struct ib_mr *mr; - - if (!pd->device->create_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->create_mr(pd, mr_init_attr); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_create_mr); - -int ib_destroy_mr(struct ib_mr *mr) -{ - struct ib_pd *pd; - int ret; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; - ret = mr->device->destroy_mr(mr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_destroy_mr); - -struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) -{ - struct ib_mr *mr; - - if (!pd->device->alloc_fast_reg_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); - - if (!IS_ERR(mr)) { - 
mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_alloc_fast_reg_mr); - -struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device, - int max_page_list_len) -{ - struct ib_fast_reg_page_list *page_list; - - if (!device->alloc_fast_reg_page_list) - return ERR_PTR(-ENOSYS); - - page_list = device->alloc_fast_reg_page_list(device, max_page_list_len); - - if (!IS_ERR(page_list)) { - page_list->device = device; - page_list->max_page_list_len = max_page_list_len; - } - - return page_list; -} -EXPORT_SYMBOL(ib_alloc_fast_reg_page_list); - -void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) -{ - page_list->device->free_fast_reg_page_list(page_list); -} -EXPORT_SYMBOL(ib_free_fast_reg_page_list); - -/* Memory windows */ - -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) -{ - struct ib_mw *mw; - - if (!pd->device->alloc_mw) - return ERR_PTR(-ENOSYS); - - mw = pd->device->alloc_mw(pd, type); - if (!IS_ERR(mw)) { - mw->device = pd->device; - mw->pd = pd; - mw->uobject = NULL; - mw->type = type; - atomic_inc(&pd->usecnt); - } - - return mw; -} -EXPORT_SYMBOL(ib_alloc_mw); - -int ib_dealloc_mw(struct ib_mw *mw) -{ - struct ib_pd *pd; - int ret; - - pd = mw->pd; - ret = mw->device->dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_mw); - -/* "Fast" memory regions */ - -struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, - int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct ib_fmr *fmr; - - if (!pd->device->alloc_fmr) - return ERR_PTR(-ENOSYS); - - fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr); - if (!IS_ERR(fmr)) { - fmr->device = pd->device; - fmr->pd = pd; - atomic_inc(&pd->usecnt); - } - - return fmr; -} -EXPORT_SYMBOL(ib_alloc_fmr); - -int ib_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *fmr; - - if (list_empty(fmr_list)) - 
return 0; - - fmr = list_entry(fmr_list->next, struct ib_fmr, list); - return fmr->device->unmap_fmr(fmr_list); -} -EXPORT_SYMBOL(ib_unmap_fmr); - -int ib_dealloc_fmr(struct ib_fmr *fmr) -{ - struct ib_pd *pd; - int ret; - - pd = fmr->pd; - ret = fmr->device->dealloc_fmr(fmr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_fmr); - -/* Multicast groups */ - -int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) -{ - int ret; - - if (!qp->device->attach_mcast) - return -ENOSYS; - - switch (rdma_node_get_transport(qp->device->node_type)) { - case RDMA_TRANSPORT_IB: - if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && - qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: - if (qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - } - - ret = qp->device->attach_mcast(qp, gid, lid); - if (!ret) - atomic_inc(&qp->usecnt); - return ret; -} -EXPORT_SYMBOL(ib_attach_mcast); - -int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) -{ - int ret; - - if (!qp->device->detach_mcast) - return -ENOSYS; - - switch (rdma_node_get_transport(qp->device->node_type)) { - case RDMA_TRANSPORT_IB: - if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && - qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: - - if (qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - } - - ret = qp->device->detach_mcast(qp, gid, lid); - if (!ret) - atomic_dec(&qp->usecnt); - return ret; -} -EXPORT_SYMBOL(ib_detach_mcast); - -struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) -{ - struct ib_xrcd *xrcd; - - if (!device->alloc_xrcd) - return ERR_PTR(-ENOSYS); - - xrcd = device->alloc_xrcd(device, NULL, NULL); - if (!IS_ERR(xrcd)) { - xrcd->device = device; - xrcd->inode = NULL; - atomic_set(&xrcd->usecnt, 0); - mutex_init(&xrcd->tgt_qp_mutex); - INIT_LIST_HEAD(&xrcd->tgt_qp_list); - } - - 
return xrcd; -} -EXPORT_SYMBOL(ib_alloc_xrcd); - -int ib_dealloc_xrcd(struct ib_xrcd *xrcd) -{ - struct ib_qp *qp; - int ret; - - if (atomic_read(&xrcd->usecnt)) - return -EBUSY; - - while (!list_empty(&xrcd->tgt_qp_list)) { - qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); - ret = ib_destroy_qp(qp); - if (ret) - return ret; - } - - return xrcd->device->dealloc_xrcd(xrcd); -} -EXPORT_SYMBOL(ib_dealloc_xrcd); - -struct ib_flow *ib_create_flow(struct ib_qp *qp, - struct ib_flow_attr *flow_attr, - int domain) -{ - struct ib_flow *flow_id; - if (!qp->device->create_flow) - return ERR_PTR(-ENOSYS); - - flow_id = qp->device->create_flow(qp, flow_attr, domain); - if (!IS_ERR(flow_id)) - atomic_inc(&qp->usecnt); - return flow_id; -} -EXPORT_SYMBOL(ib_create_flow); - -int ib_destroy_flow(struct ib_flow *flow_id) -{ - int err; - struct ib_qp *qp; - - if (!flow_id) - return -EINVAL; - qp = flow_id->qp; - if (!qp->device->destroy_flow) - return -ENOSYS; - err = qp->device->destroy_flow(flow_id); - if (!err) - atomic_dec(&qp->usecnt); - return err; -} -EXPORT_SYMBOL(ib_destroy_flow); - -struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr, - struct ib_udata *udata) -{ - struct ib_dct *dct; - - if (!pd->device->exp_create_dct) - return ERR_PTR(-ENOSYS); - - dct = pd->device->exp_create_dct(pd, attr, udata); - if (!IS_ERR(dct)) { - dct->pd = pd; - dct->srq = attr->srq; - dct->cq = attr->cq; - atomic_inc(&dct->srq->usecnt); - atomic_inc(&dct->cq->usecnt); - atomic_inc(&dct->pd->usecnt); - } - - return dct; -} -EXPORT_SYMBOL(ib_create_dct); - -int ib_destroy_dct(struct ib_dct *dct) -{ - int err; - - if (!dct->device->exp_destroy_dct) - return -ENOSYS; - - err = dct->device->exp_destroy_dct(dct); - if (!err) { - atomic_dec(&dct->srq->usecnt); - atomic_dec(&dct->cq->usecnt); - atomic_dec(&dct->pd->usecnt); - } - - return err; -} -EXPORT_SYMBOL(ib_destroy_dct); - -int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr) -{ - if 
(!dct->device->exp_query_dct) - return -ENOSYS; - - return dct->device->exp_query_dct(dct, attr); -} -EXPORT_SYMBOL(ib_query_dct); - -int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, - struct ib_mr_status *mr_status) -{ - return mr->device->check_mr_status ? - mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; -} -EXPORT_SYMBOL(ib_check_mr_status); Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/verbs.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/addr.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/addr.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/addr.c (nonexistent) @@ -1,686 +0,0 @@ -/* - * Copyright (c) 2005 Voltaire Inc. All rights reserved. - * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. - * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("IB Address Translation"); -MODULE_LICENSE("Dual BSD/GPL"); - -struct addr_req { - struct list_head list; - struct sockaddr_storage src_addr; - struct sockaddr_storage dst_addr; - struct rdma_dev_addr *addr; - struct rdma_addr_client *client; - void *context; - void (*callback)(int status, struct sockaddr *src_addr, - struct rdma_dev_addr *addr, void *context); - unsigned long timeout; - int status; -}; - -static void process_req(struct work_struct *work); - -static DEFINE_MUTEX(lock); -static LIST_HEAD(req_list); -static struct delayed_work work; -static struct workqueue_struct *addr_wq; - -static struct rdma_addr_client self; -void rdma_addr_register_client(struct rdma_addr_client *client) -{ - atomic_set(&client->refcount, 1); - init_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_register_client); - -static inline void put_client(struct rdma_addr_client *client) -{ - if (atomic_dec_and_test(&client->refcount)) - complete(&client->comp); -} - -void rdma_addr_unregister_client(struct rdma_addr_client *client) -{ - put_client(client); - wait_for_completion(&client->comp); -} 
-EXPORT_SYMBOL(rdma_addr_unregister_client); - -int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, - const unsigned char *dst_dev_addr) -{ - if (dev->if_type == IFT_INFINIBAND) - dev_addr->dev_type = ARPHRD_INFINIBAND; - else if (dev->if_type == IFT_ETHER) - dev_addr->dev_type = ARPHRD_ETHER; - else - dev_addr->dev_type = 0; - memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen); - memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr), - dev->if_addrlen); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen); - dev_addr->bound_dev_if = dev->if_index; - return 0; -} -EXPORT_SYMBOL(rdma_copy_addr); - -#define SCOPE_ID_CACHE(_scope_id, _addr6) do { \ - (_addr6)->sin6_addr.s6_addr[3] = (_scope_id); \ - (_addr6)->sin6_scope_id = 0; } while (0) - -#define SCOPE_ID_RESTORE(_scope_id, _addr6) do { \ - (_addr6)->sin6_scope_id = (_scope_id); \ - (_addr6)->sin6_addr.s6_addr[3] = 0; } while (0) - -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, - u16 *vlan_id) -{ - struct net_device *dev; - int ret = -EADDRNOTAVAIL; - - if (dev_addr->bound_dev_if) { - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (!dev) - return -ENODEV; - ret = rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); - return ret; - } - - switch (addr->sa_family) { - case AF_INET: - dev = ip_dev_find(&init_net, - ((struct sockaddr_in *) addr)->sin_addr.s_addr); - - if (!dev) - return ret; - - ret = rdma_copy_addr(dev_addr, dev, NULL); - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - break; - -#if defined(INET6) - case AF_INET6: - { - struct sockaddr_in6 *sin6; - struct ifaddr *ifa; - in_port_t port; - uint32_t scope_id; - - sin6 = (struct sockaddr_in6 *)addr; - port = sin6->sin6_port; - sin6->sin6_port = 0; - scope_id = sin6->sin6_scope_id; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_CACHE(scope_id, sin6); - CURVNET_SET_QUIET(&init_net); - ifa = 
ifa_ifwithaddr(addr); - CURVNET_RESTORE(); - sin6->sin6_port = port; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_RESTORE(scope_id, sin6); - if (ifa == NULL) { - ret = -ENODEV; - break; - } - ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL); - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp); - ifa_free(ifa); - break; - } -#endif - default: - break; - } - return ret; -} -EXPORT_SYMBOL(rdma_translate_ip); - -static void set_timeout(unsigned long time) -{ - unsigned long delay; - - delay = time - jiffies; - if ((long)delay <= 0) - delay = 1; - - mod_delayed_work(addr_wq, &work, delay); -} - -static void queue_req(struct addr_req *req) -{ - struct addr_req *temp_req; - - mutex_lock(&lock); - list_for_each_entry_reverse(temp_req, &req_list, list) { - if (time_after_eq(req->timeout, temp_req->timeout)) - break; - } - - list_add(&req->list, &temp_req->list); - - if (req_list.next == &req->list) - set_timeout(req->timeout); - mutex_unlock(&lock); -} - -static int addr_resolve(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) -{ - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct ifaddr *ifa; - struct ifnet *ifp; - struct rtentry *rte; -#if defined(INET) || defined(INET6) - in_port_t port; -#endif -#ifdef INET6 - uint32_t scope_id; -#endif - u_char edst[MAX_ADDR_LEN]; - int multi; - int bcast; - int is_gw = 0; - int error = 0; - - CURVNET_SET_QUIET(&init_net); - - /* - * Determine whether the address is unicast, multicast, or broadcast - * and whether the source interface is valid. 
- */ - multi = 0; - bcast = 0; - sin = NULL; - sin6 = NULL; - ifp = NULL; - rte = NULL; - ifa = NULL; - ifp = NULL; - memset(edst, 0, sizeof(edst)); -#ifdef INET6 - scope_id = -1U; -#endif - - switch (dst_in->sa_family) { -#ifdef INET - case AF_INET: - sin = (struct sockaddr_in *)dst_in; - if (sin->sin_addr.s_addr == INADDR_BROADCAST) - bcast = 1; - if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) - multi = 1; - sin = (struct sockaddr_in *)src_in; - if (sin->sin_addr.s_addr != INADDR_ANY) { - /* - * Address comparison fails if the port is set - * cache it here to be restored later. - */ - port = sin->sin_port; - sin->sin_port = 0; - memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - - /* - * If we have a source address to use look it - * up first and verify that it is a local - * interface: - */ - CURVNET_SET_QUIET(&init_net); - ifa = ifa_ifwithaddr(src_in); - CURVNET_RESTORE(); - sin->sin_port = port; - if (ifa == NULL) { - error = ENETUNREACH; - goto done; - } - ifp = ifa->ifa_ifp; - ifa_free(ifa); - if (bcast || multi) - goto mcast; - } - break; -#endif -#ifdef INET6 - case AF_INET6: - sin6 = (struct sockaddr_in6 *)dst_in; - if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) - multi = 1; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { - /* - * The IB address comparison fails if the - * scope ID is set and not part of the addr: - */ - scope_id = sin6->sin6_scope_id; - if (scope_id < 256) - SCOPE_ID_CACHE(scope_id, sin6); - } - sin6 = (struct sockaddr_in6 *)src_in; - if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - port = sin6->sin6_port; - sin6->sin6_port = 0; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { - if (scope_id < 256) - SCOPE_ID_CACHE(scope_id, sin6); - } - - /* - * If we have a source address to use look it - * up first and verify that it is a local - * interface: - */ - CURVNET_SET_QUIET(&init_net); - ifa = ifa_ifwithaddr(src_in); - CURVNET_RESTORE(); - sin6->sin6_port = port; - if (ifa == NULL) { - error = ENETUNREACH; - goto done; - } - ifp = 
ifa->ifa_ifp; - ifa_free(ifa); - if (bcast || multi) - goto mcast; - } - break; -#endif - default: - error = EINVAL; - goto done; - } - /* - * Make sure the route exists and has a valid link. - */ - rte = rtalloc1(dst_in, 1, 0); - if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { - if (rte) - RTFREE_LOCKED(rte); - error = EHOSTUNREACH; - goto done; - } - if (rte->rt_flags & RTF_GATEWAY) - is_gw = 1; - /* - * If it's not multicast or broadcast and the route doesn't match the - * requested interface return unreachable. Otherwise fetch the - * correct interface pointer and unlock the route. - */ - if (multi || bcast) { - if (ifp == NULL) { - ifp = rte->rt_ifp; - /* rt_ifa holds the route answer source address */ - ifa = rte->rt_ifa; - } - RTFREE_LOCKED(rte); - } else if (ifp && ifp != rte->rt_ifp) { - RTFREE_LOCKED(rte); - error = ENETUNREACH; - goto done; - } else { - if (ifp == NULL) { - ifp = rte->rt_ifp; - ifa = rte->rt_ifa; - } - RT_UNLOCK(rte); - } -#if defined(INET) || defined(INET6) -mcast: -#endif - if (bcast) { - memcpy(edst, ifp->if_broadcastaddr, ifp->if_addrlen); - goto done; - } else if (multi) { - struct sockaddr *llsa; - struct sockaddr_dl sdl; - - sdl.sdl_len = sizeof(sdl); - llsa = (struct sockaddr *)&sdl; - - if (ifp->if_resolvemulti == NULL) { - error = EOPNOTSUPP; - goto done; - } - error = ifp->if_resolvemulti(ifp, &llsa, dst_in); - if (error == 0) { - memcpy(edst, LLADDR((struct sockaddr_dl *)llsa), - ifp->if_addrlen); - } - goto done; - } - /* - * Resolve the link local address. - */ - switch (dst_in->sa_family) { -#ifdef INET - case AF_INET: - error = arpresolve(ifp, is_gw, NULL, - is_gw ? rte->rt_gateway : dst_in, edst, NULL, NULL); - break; -#endif -#ifdef INET6 - case AF_INET6: - error = nd6_resolve(ifp, is_gw, NULL, - is_gw ? 
rte->rt_gateway : dst_in, edst, NULL, NULL); - break; -#endif - default: - KASSERT(0, ("rdma_addr_resolve: Unreachable")); - error = EINVAL; - break; - } - RTFREE(rte); -done: - if (error == 0) - error = -rdma_copy_addr(addr, ifp, edst); - if (error == 0) - memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr)); -#ifdef INET6 - if (scope_id < 256) { - sin6 = (struct sockaddr_in6 *)src_in; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_RESTORE(scope_id, sin6); - sin6 = (struct sockaddr_in6 *)dst_in; - SCOPE_ID_RESTORE(scope_id, sin6); - } -#endif - if (error == EWOULDBLOCK) - error = ENODATA; - - CURVNET_RESTORE(); - return -error; -} - -static void process_req(struct work_struct *work) -{ - struct addr_req *req, *temp_req; - struct sockaddr *src_in, *dst_in; - struct list_head done_list; - - INIT_LIST_HEAD(&done_list); - - mutex_lock(&lock); - list_for_each_entry_safe(req, temp_req, &req_list, list) { - if (req->status == -ENODATA) { - src_in = (struct sockaddr *) &req->src_addr; - dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve(src_in, dst_in, req->addr); - if (req->status && time_after_eq(jiffies, req->timeout)) - req->status = -ETIMEDOUT; - else if (req->status == -ENODATA) - continue; - } - list_move_tail(&req->list, &done_list); - } - - if (!list_empty(&req_list)) { - req = list_entry(req_list.next, struct addr_req, list); - set_timeout(req->timeout); - } - mutex_unlock(&lock); - - list_for_each_entry_safe(req, temp_req, &done_list, list) { - list_del(&req->list); - req->callback(req->status, (struct sockaddr *) &req->src_addr, - req->addr, req->context); - put_client(req->client); - kfree(req); - } -} - -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, - void (*callback)(int status, struct sockaddr *src_addr, - struct rdma_dev_addr *addr, void *context), - void *context) -{ - struct sockaddr *src_in, *dst_in; - 
struct addr_req *req; - int ret = 0; - - req = kzalloc(sizeof *req, GFP_KERNEL); - if (!req) - return -ENOMEM; - - src_in = (struct sockaddr *) &req->src_addr; - dst_in = (struct sockaddr *) &req->dst_addr; - - if (src_addr) { - if (src_addr->sa_family != dst_addr->sa_family) { - ret = -EINVAL; - goto err; - } - - memcpy(src_in, src_addr, ip_addr_size(src_addr)); - } else { - src_in->sa_family = dst_addr->sa_family; - } - - memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); - req->addr = addr; - req->callback = callback; - req->context = context; - req->client = client; - atomic_inc(&client->refcount); - - req->status = addr_resolve(src_in, dst_in, addr); - switch (req->status) { - case 0: - req->timeout = jiffies; - queue_req(req); - break; - case -ENODATA: - req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; - queue_req(req); - break; - default: - ret = req->status; - atomic_dec(&client->refcount); - goto err; - } - return ret; -err: - kfree(req); - return ret; -} -EXPORT_SYMBOL(rdma_resolve_ip); - -void rdma_addr_cancel(struct rdma_dev_addr *addr) -{ - struct addr_req *req, *temp_req; - - mutex_lock(&lock); - list_for_each_entry_safe(req, temp_req, &req_list, list) { - if (req->addr == addr) { - req->status = -ECANCELED; - req->timeout = jiffies; - list_move(&req->list, &req_list); - set_timeout(req->timeout); - break; - } - } - mutex_unlock(&lock); -} -EXPORT_SYMBOL(rdma_addr_cancel); - -struct resolve_cb_context { - struct rdma_dev_addr *addr; - struct completion comp; -}; - -static void resolve_cb(int status, struct sockaddr *src_addr, - struct rdma_dev_addr *addr, void *context) -{ - memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct - rdma_dev_addr)); - complete(&((struct resolve_cb_context *)context)->comp); -} - -int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, - u16 *vlan_id, u32 scope_id) -{ - int ret = 0; - struct rdma_dev_addr dev_addr; - struct resolve_cb_context ctx; - struct net_device *dev; 
- - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } sgid_addr, dgid_addr; - - - ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid, scope_id); - if (ret) - return ret; - - ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid, scope_id); - if (ret) - return ret; - - memset(&dev_addr, 0, sizeof(dev_addr)); - - ctx.addr = &dev_addr; - init_completion(&ctx.comp); - ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); - if (ret) - return ret; - - wait_for_completion(&ctx.comp); - - memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); - dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); - if (!dev) - return -ENODEV; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - return ret; -} -EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); - -u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num) -{ -#ifdef INET6 - struct ifnet *ifp; - if (ib->get_netdev == NULL) - return (-1U); - ifp = ib->get_netdev(ib, port_num); - if (ifp == NULL) - return (-1U); - return (in6_getscopezone(ifp, IPV6_ADDR_SCOPE_LINKLOCAL)); -#else - return (-1U); -#endif -} - -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id, - u32 scope_id) -{ - int ret = 0; - struct rdma_dev_addr dev_addr; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } gid_addr; - - ret = rdma_gid2ip(&gid_addr._sockaddr, sgid, scope_id); - if (ret) - return ret; - memset(&dev_addr, 0, sizeof(dev_addr)); - ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); - if (ret) - return ret; - - memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); - return ret; -} -EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); - -static int netevent_callback(struct notifier_block *self, unsigned long event, - void *ctx) -{ - if (event == NETEVENT_NEIGH_UPDATE) { - set_timeout(jiffies); - } - return 0; -} - -static struct notifier_block nb = 
{ - .notifier_call = netevent_callback -}; - -static int __init addr_init(void) -{ - INIT_DELAYED_WORK(&work, process_req); - addr_wq = create_singlethread_workqueue("ib_addr"); - if (!addr_wq) - return -ENOMEM; - - register_netevent_notifier(&nb); - rdma_addr_register_client(&self); - return 0; -} - -static void __exit addr_cleanup(void) -{ - rdma_addr_unregister_client(&self); - unregister_netevent_notifier(&nb); - destroy_workqueue(addr_wq); -} - -module_init(addr_init); -module_exit(addr_cleanup); Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/addr.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/smi.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/smi.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/smi.c (nonexistent) @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. - * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. - * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. - * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include "smi.h" - -/* - * Fixup a directed route SMP for sending - * Return 0 if the SMP should be discarded - */ -enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num) -{ - u8 hop_ptr, hop_cnt; - - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; - - /* See section 14.2.2.2, Vol 1 IB spec */ - /* C14-6 -- valid hop_cnt values are from 0 to 63 */ - if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) - return IB_SMI_DISCARD; - - if (!ib_get_smp_direction(smp)) { - /* C14-9:1 */ - if (hop_cnt && hop_ptr == 0) { - smp->hop_ptr++; - return (smp->initial_path[smp->hop_ptr] == - port_num ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:2 */ - if (hop_ptr && hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - /* smp->return_path set when received */ - smp->hop_ptr++; - return (smp->initial_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) { - /* smp->return_path set when received */ - smp->hop_ptr++; - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_dlid == IB_LID_PERMISSIVE ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ - /* C14-9:5 -- Fail unreasonable hop pointer */ - return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); - - } else { - /* C14-13:1 */ - if (hop_cnt && hop_ptr == hop_cnt + 1) { - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:2 */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:3 -- at the end of the DR segment of path */ - if (hop_ptr == 1) { - smp->hop_ptr--; - /* C14-13:3 -- SMPs destined for SM shouldn't be here */ - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_slid == IB_LID_PERMISSIVE ? 
- IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ - if (hop_ptr == 0) - return IB_SMI_HANDLE; - - /* C14-13:5 -- Check for unreasonable hop pointer */ - return IB_SMI_DISCARD; - } -} - -/* - * Adjust information for a received SMP - * Return 0 if the SMP should be dropped - */ -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, - int port_num, int phys_port_cnt) -{ - u8 hop_ptr, hop_cnt; - - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; - - /* See section 14.2.2.2, Vol 1 IB spec */ - /* C14-6 -- valid hop_cnt values are from 0 to 63 */ - if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) - return IB_SMI_DISCARD; - - if (!ib_get_smp_direction(smp)) { - /* C14-9:1 -- sender should have incremented hop_ptr */ - if (hop_cnt && hop_ptr == 0) - return IB_SMI_DISCARD; - - /* C14-9:2 -- intermediate hop */ - if (hop_ptr && hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - smp->return_path[hop_ptr] = port_num; - /* smp->hop_ptr updated when sending */ - return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) { - if (hop_cnt) - smp->return_path[hop_ptr] = port_num; - /* smp->hop_ptr updated when sending */ - - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_dlid == IB_LID_PERMISSIVE ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ - /* C14-9:5 -- fail unreasonable hop pointer */ - return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); - - } else { - - /* C14-13:1 */ - if (hop_cnt && hop_ptr == hop_cnt + 1) { - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == - port_num ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:2 */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - /* smp->hop_ptr updated when sending */ - return (smp->return_path[hop_ptr-1] <= phys_port_cnt ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == 1) { - if (smp->dr_slid == IB_LID_PERMISSIVE) { - /* giving SMP to SM - update hop_ptr */ - smp->hop_ptr--; - return IB_SMI_HANDLE; - } - /* smp->hop_ptr updated when sending */ - return (node_type == RDMA_NODE_IB_SWITCH ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:4 -- hop_ptr = 0 -> give to SM */ - /* C14-13:5 -- Check for unreasonable hop pointer */ - return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } -} - -enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) -{ - u8 hop_ptr, hop_cnt; - - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; - - if (!ib_get_smp_direction(smp)) { - /* C14-9:2 -- intermediate hop */ - if (hop_ptr && hop_ptr < hop_cnt) - return IB_SMI_FORWARD; - - /* C14-9:3 -- at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) - return (smp->dr_dlid == IB_LID_PERMISSIVE ? - IB_SMI_SEND : IB_SMI_LOCAL); - - /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ - if (hop_ptr == hop_cnt + 1) - return IB_SMI_SEND; - } else { - /* C14-13:2 -- intermediate hop */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) - return IB_SMI_FORWARD; - - /* C14-13:3 -- at the end of the DR segment of path */ - if (hop_ptr == 1) - return (smp->dr_slid != IB_LID_PERMISSIVE ? - IB_SMI_SEND : IB_SMI_LOCAL); - } - return IB_SMI_LOCAL; -} - -/* - * Return the forwarding port number from initial_path for outgoing SMP and - * from return_path for returning SMP - */ -int smi_get_fwd_port(struct ib_smp *smp) -{ - return (!ib_get_smp_direction(smp) ? 
smp->initial_path[smp->hop_ptr+1] : - smp->return_path[smp->hop_ptr-1]); -} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/smi.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/sysfs.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/sysfs.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/sysfs.c (nonexistent) @@ -1,1026 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "core_priv.h" - -#include -#include -#include -#include - -#include -#include - -struct ib_port { - struct kobject kobj; - struct ib_device *ibdev; - struct attribute_group gid_group; - struct attribute_group pkey_group; - u8 port_num; -}; - -struct port_attribute { - struct attribute attr; - ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); - ssize_t (*store)(struct ib_port *, struct port_attribute *, - const char *buf, size_t count); -}; - -#define PORT_ATTR(_name, _mode, _show, _store) \ -struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) - -#define PORT_ATTR_RO(_name) \ -struct port_attribute port_attr_##_name = __ATTR_RO(_name) - -struct port_table_attribute { - struct port_attribute attr; - char name[8]; - int index; -}; - -static ssize_t port_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - - if (!port_attr->show) - return -EIO; - - return port_attr->show(p, port_attr, buf); -} - -static const struct sysfs_ops port_sysfs_ops = { - .show = port_attr_show -}; - -static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - static const char *state_name[] = { - [IB_PORT_NOP] = "NOP", - [IB_PORT_DOWN] = "DOWN", - [IB_PORT_INIT] = "INIT", - [IB_PORT_ARMED] = "ARMED", - [IB_PORT_ACTIVE] = "ACTIVE", - [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" - }; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d: %s\n", attr.state, - 
attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? - state_name[attr.state] : "UNKNOWN"); -} - -static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "0x%x\n", attr.lid); -} - -static ssize_t lid_mask_count_show(struct ib_port *p, - struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d\n", attr.lmc); -} - -static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "0x%x\n", attr.sm_lid); -} - -static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d\n", attr.sm_sl); -} - -static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "0x%08x\n", attr.port_cap_flags); -} - -static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - char *speed = ""; - int rate; /* in deci-Gb/sec */ - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - ib_active_speed_enum_to_rate(attr.active_speed, - &rate, - &speed); - - rate *= ib_width_enum_to_int(attr.active_width); - if (rate < 0) - return -EINVAL; - - return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", - rate / 10, rate % 10 ? 
".5" : "", - ib_width_enum_to_int(attr.active_width), speed); -} - -static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - switch (attr.phys_state) { - case 1: return sprintf(buf, "1: Sleep\n"); - case 2: return sprintf(buf, "2: Polling\n"); - case 3: return sprintf(buf, "3: Disabled\n"); - case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); - case 5: return sprintf(buf, "5: LinkUp\n"); - case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); - case 7: return sprintf(buf, "7: Phy Test\n"); - default: return sprintf(buf, "%d: \n", attr.phys_state); - } -} - -static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - return sprintf(buf, "%s\n", "InfiniBand"); - case IB_LINK_LAYER_ETHERNET: - return sprintf(buf, "%s\n", "Ethernet"); - case IB_LINK_LAYER_SCIF: - return sprintf(buf, "%s\n", "SCIF"); - default: - return sprintf(buf, "%s\n", "Unknown"); - } -} - -static PORT_ATTR_RO(state); -static PORT_ATTR_RO(lid); -static PORT_ATTR_RO(lid_mask_count); -static PORT_ATTR_RO(sm_lid); -static PORT_ATTR_RO(sm_sl); -static PORT_ATTR_RO(cap_mask); -static PORT_ATTR_RO(rate); -static PORT_ATTR_RO(phys_state); -static PORT_ATTR_RO(link_layer); - -static struct attribute *port_default_attrs[] = { - &port_attr_state.attr, - &port_attr_lid.attr, - &port_attr_lid_mask_count.attr, - &port_attr_sm_lid.attr, - &port_attr_sm_sl.attr, - &port_attr_cap_mask.attr, - &port_attr_rate.attr, - &port_attr_phys_state.attr, - &port_attr_link_layer.attr, - NULL -}; - -static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, - char *buf) -{ - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - union ib_gid gid; - ssize_t 
ret; - - ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); - if (ret) - return ret; - - return sprintf(buf, GID_PRINT_FMT"\n",GID_PRINT_ARGS(gid.raw)); -} - -static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, - char *buf) -{ - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - u16 pkey; - ssize_t ret; - - ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); - if (ret) - return ret; - - return sprintf(buf, "0x%04x\n", pkey); -} - -static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr, - char *buf, int c_ext) -{ - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; - ssize_t ret; - - if (!p->ibdev->process_mad) - return -ENXIO; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) { - ret = -ENOMEM; - goto out; - } - - in_mad->mad_hdr.base_version = 1; - in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; - in_mad->mad_hdr.class_version = 1; - in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - if (c_ext) - in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT; - else - in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS; - - in_mad->data[41] = p->port_num; /* PortSelect field */ - - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, in_mad, out_mad) & - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { - ret = -EINVAL; - goto out; - } - - switch (width) { - case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> - (4 - (offset % 8))) & 0xf); - break; - case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); - break; - case 16: - ret = sprintf(buf, "%u\n", - 
be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); - break; - case 32: - ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); - break; - case 64: - ret = sprintf(buf, "%llu\n", - (unsigned long long)be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8))); - break; - default: - ret = 0; - } - -out: - kfree(in_mad); - kfree(out_mad); - - return ret; -} - -#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ -} - -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) -{ - return get_pma_counters(p, attr, buf, 0); -} - -static PORT_PMA_ATTR(symbol_error , 0, 16, 32); -static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); -static PORT_PMA_ATTR(link_downed , 2, 8, 56); -static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); -static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); -static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); -static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); -static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); -static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); -static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); -static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); -static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); -static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); -static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); -static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); -static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); - -static struct attribute *pma_attrs[] = { - &port_pma_attr_symbol_error.attr.attr, - &port_pma_attr_link_error_recovery.attr.attr, - &port_pma_attr_link_downed.attr.attr, - &port_pma_attr_port_rcv_errors.attr.attr, - &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, - 
&port_pma_attr_port_rcv_switch_relay_errors.attr.attr, - &port_pma_attr_port_xmit_discards.attr.attr, - &port_pma_attr_port_xmit_constraint_errors.attr.attr, - &port_pma_attr_port_rcv_constraint_errors.attr.attr, - &port_pma_attr_local_link_integrity_errors.attr.attr, - &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, - &port_pma_attr_VL15_dropped.attr.attr, - &port_pma_attr_port_xmit_data.attr.attr, - &port_pma_attr_port_rcv_data.attr.attr, - &port_pma_attr_port_xmit_packets.attr.attr, - &port_pma_attr_port_rcv_packets.attr.attr, - NULL -}; - -static struct attribute_group pma_group = { - .name = "counters", - .attrs = pma_attrs -}; - -#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_ext_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ -} - -static ssize_t show_pma_counter_ext(struct ib_port *p, - struct port_attribute *attr, char *buf) -{ - return get_pma_counters(p, attr, buf, 1); -} - -static PORT_PMA_ATTR_EXT(port_xmit_data_64 , 0, 64, 64); -static PORT_PMA_ATTR_EXT(port_rcv_data_64 , 0, 64, 128); -static PORT_PMA_ATTR_EXT(port_xmit_packets_64 , 0, 64, 192); -static PORT_PMA_ATTR_EXT(port_rcv_packets_64 , 0, 64, 256); -static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets , 0, 64, 320); -static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets , 0, 64, 384); -static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets , 0, 64, 448); -static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets , 0, 64, 512); - -static struct attribute *pma_attrs_ext[] = { - &port_pma_attr_ext_port_xmit_data_64.attr.attr, - &port_pma_attr_ext_port_rcv_data_64.attr.attr, - &port_pma_attr_ext_port_xmit_packets_64.attr.attr, - &port_pma_attr_ext_port_rcv_packets_64.attr.attr, - &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr, - &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr, - &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr, - 
&port_pma_attr_ext_port_multicast_rcv_packets.attr.attr, - NULL -}; - -static struct attribute_group pma_ext_group = { - .name = "counters_ext", - .attrs = pma_attrs_ext -}; - -static void ib_port_release(struct kobject *kobj) -{ - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - struct attribute *a; - int i; - - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); - - kfree(p->gid_group.attrs); - - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); - - kfree(p->pkey_group.attrs); - - kfree(p); -} - -static struct kobj_type port_type = { - .release = ib_port_release, - .sysfs_ops = &port_sysfs_ops, - .default_attrs = port_default_attrs -}; - -static void ib_device_release(struct device *device) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - kfree(dev); -} - -#ifdef __linux__ -/* BSD supports this through devfs(5) and devd(8). */ -static int ib_device_uevent(struct device *device, - struct kobj_uevent_env *env) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - if (add_uevent_var(env, "NAME=%s", dev->name)) - return -ENOMEM; - - /* - * It would be nice to pass the node GUID with the event... 
- */ - - return 0; -} -#endif - -static struct attribute ** -alloc_group_attrs(ssize_t (*show)(struct ib_port *, - struct port_attribute *, char *buf), - int len) -{ - struct attribute **tab_attr; - struct port_table_attribute *element; - int i; - - tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); - if (!tab_attr) - return NULL; - - for (i = 0; i < len; i++) { - element = kzalloc(sizeof(struct port_table_attribute), - GFP_KERNEL); - if (!element) - goto err; - - if (snprintf(element->name, sizeof(element->name), - "%d", i) >= sizeof(element->name)) { - kfree(element); - goto err; - } - - element->attr.attr.name = element->name; - element->attr.attr.mode = S_IRUGO; - element->attr.show = show; - element->index = i; - sysfs_attr_init(&element->attr.attr); - - tab_attr[i] = &element->attr.attr; - } - - return tab_attr; - -err: - while (--i >= 0) - kfree(tab_attr[i]); - kfree(tab_attr); - return NULL; -} - -static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) -{ - struct ib_port *p; - struct ib_port_attr attr; - int i; - int ret; - - ret = ib_query_port(device, port_num, &attr); - if (ret) - return ret; - - p = kzalloc(sizeof *p, GFP_KERNEL); - if (!p) - return -ENOMEM; - - p->ibdev = device; - p->port_num = port_num; - - ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, - "%d", port_num); - if (ret) - goto err_put; - - ret = sysfs_create_group(&p->kobj, &pma_group); - if (ret) - goto err_put; - - ret = sysfs_create_group(&p->kobj, &pma_ext_group); - if (ret) - goto err_remove_pma; - - p->gid_group.name = "gids"; - p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); - if (!p->gid_group.attrs) - goto err_remove_pma_ext; - - ret = sysfs_create_group(&p->kobj, &p->gid_group); - if (ret) - goto err_free_gid; - - p->pkey_group.name = "pkeys"; - p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, - attr.pkey_tbl_len); - if 
(!p->pkey_group.attrs) - goto err_remove_gid; - - ret = sysfs_create_group(&p->kobj, &p->pkey_group); - if (ret) - goto err_free_pkey; - - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); - if (ret) - goto err_remove_pkey; - } - - list_add_tail(&p->kobj.entry, &device->port_list); -#ifdef __linux__ - kobject_uevent(&p->kobj, KOBJ_ADD); -#endif - return 0; - -err_remove_pkey: - sysfs_remove_group(&p->kobj, &p->pkey_group); - -err_free_pkey: - for (i = 0; i < attr.pkey_tbl_len; ++i) - kfree(p->pkey_group.attrs[i]); - - kfree(p->pkey_group.attrs); - -err_remove_gid: - sysfs_remove_group(&p->kobj, &p->gid_group); - -err_free_gid: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_group.attrs[i]); - - kfree(p->gid_group.attrs); - -err_remove_pma_ext: - sysfs_remove_group(&p->kobj, &pma_ext_group); - -err_remove_pma: - sysfs_remove_group(&p->kobj, &pma_group); - -err_put: - kobject_put(device->ports_parent); - kfree(p); - return ret; -} - -static ssize_t show_node_type(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - switch (dev->node_type) { - case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); - case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); - case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); - case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); - case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type); - default: return sprintf(buf, "%d: \n", dev->node_type); - } -} - -static ssize_t show_sys_image_guid(struct device *device, - struct device_attribute *dev_attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_attr attr; - ssize_t ret; - - ret = ib_query_device(dev, &attr); - if (ret) - return ret; - - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) 
&attr.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); -} - -static ssize_t show_node_guid(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &dev->node_guid)[0]), - be16_to_cpu(((__be16 *) &dev->node_guid)[1]), - be16_to_cpu(((__be16 *) &dev->node_guid)[2]), - be16_to_cpu(((__be16 *) &dev->node_guid)[3])); -} - -static ssize_t show_node_desc(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%.64s\n", dev->node_desc); -} - -static ssize_t set_node_desc(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_modify desc = {}; - int ret; - - if (!dev->modify_device) - return -EIO; - - memcpy(desc.node_desc, buf, min_t(int, count, 64)); - ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); - if (ret) - return ret; - - return count; -} - -static ssize_t show_cmd_perf(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%d\n", dev->cmd_perf); -} - -static ssize_t set_cmd_perf(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - u32 val; - - if (sscanf(buf, "0x%x", &val) != 1) - return -EINVAL; - - dev->cmd_perf = val; - - return count; -} - -static ssize_t show_cmd_avg(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - 
- return sprintf(buf, "%llu\n", (unsigned long long)dev->cmd_avg); -} - -static ssize_t set_cmd_avg(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - spin_lock(&dev->cmd_perf_lock); - dev->cmd_avg = 0; - dev->cmd_n = 0; - spin_unlock(&dev->cmd_perf_lock); - - return count; -} - -static ssize_t show_cmd_n(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%d\n", dev->cmd_n); -} - -static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); -static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); -static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); -static DEVICE_ATTR(cmd_perf, S_IRUGO | S_IWUSR, show_cmd_perf, set_cmd_perf); -static DEVICE_ATTR(cmd_avg, S_IRUGO | S_IWUSR, show_cmd_avg, set_cmd_avg); -static DEVICE_ATTR(cmd_n, S_IRUGO, show_cmd_n, NULL); - -static struct device_attribute *ib_class_attributes[] = { - &dev_attr_node_type, - &dev_attr_sys_image_guid, - &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_cmd_perf, - &dev_attr_cmd_avg, - &dev_attr_cmd_n, -}; - -static struct class ib_class = { - .name = "infiniband", - .dev_release = ib_device_release, -#ifdef __linux__ - .dev_uevent = ib_device_uevent, -#endif -}; - -/* Show a given an attribute in the statistics group */ -static ssize_t show_protocol_stat(const struct device *device, - struct device_attribute *attr, char *buf, - unsigned offset) -{ - struct ib_device *dev = container_of(__DECONST(struct device *, device), struct ib_device, dev); - union rdma_protocol_stats stats; - ssize_t ret; - - ret = dev->get_protocol_stats(dev, &stats); - if (ret) - return ret; - - return sprintf(buf, "%llu\n", - (unsigned long long) ((u64 *) &stats)[offset]); -} - -/* 
generate a read-only iwarp statistics attribute */ -#define IW_STATS_ENTRY(name) \ -static ssize_t show_##name(struct device *device, \ - struct device_attribute *attr, char *buf) \ -{ \ - return show_protocol_stat(device, attr, buf, \ - offsetof(struct iw_protocol_stats, name) / \ - sizeof (u64)); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) - -IW_STATS_ENTRY(ipInReceives); -IW_STATS_ENTRY(ipInHdrErrors); -IW_STATS_ENTRY(ipInTooBigErrors); -IW_STATS_ENTRY(ipInNoRoutes); -IW_STATS_ENTRY(ipInAddrErrors); -IW_STATS_ENTRY(ipInUnknownProtos); -IW_STATS_ENTRY(ipInTruncatedPkts); -IW_STATS_ENTRY(ipInDiscards); -IW_STATS_ENTRY(ipInDelivers); -IW_STATS_ENTRY(ipOutForwDatagrams); -IW_STATS_ENTRY(ipOutRequests); -IW_STATS_ENTRY(ipOutDiscards); -IW_STATS_ENTRY(ipOutNoRoutes); -IW_STATS_ENTRY(ipReasmTimeout); -IW_STATS_ENTRY(ipReasmReqds); -IW_STATS_ENTRY(ipReasmOKs); -IW_STATS_ENTRY(ipReasmFails); -IW_STATS_ENTRY(ipFragOKs); -IW_STATS_ENTRY(ipFragFails); -IW_STATS_ENTRY(ipFragCreates); -IW_STATS_ENTRY(ipInMcastPkts); -IW_STATS_ENTRY(ipOutMcastPkts); -IW_STATS_ENTRY(ipInBcastPkts); -IW_STATS_ENTRY(ipOutBcastPkts); -IW_STATS_ENTRY(tcpRtoAlgorithm); -IW_STATS_ENTRY(tcpRtoMin); -IW_STATS_ENTRY(tcpRtoMax); -IW_STATS_ENTRY(tcpMaxConn); -IW_STATS_ENTRY(tcpActiveOpens); -IW_STATS_ENTRY(tcpPassiveOpens); -IW_STATS_ENTRY(tcpAttemptFails); -IW_STATS_ENTRY(tcpEstabResets); -IW_STATS_ENTRY(tcpCurrEstab); -IW_STATS_ENTRY(tcpInSegs); -IW_STATS_ENTRY(tcpOutSegs); -IW_STATS_ENTRY(tcpRetransSegs); -IW_STATS_ENTRY(tcpInErrs); -IW_STATS_ENTRY(tcpOutRsts); - -static struct attribute *iw_proto_stats_attrs[] = { - &dev_attr_ipInReceives.attr, - &dev_attr_ipInHdrErrors.attr, - &dev_attr_ipInTooBigErrors.attr, - &dev_attr_ipInNoRoutes.attr, - &dev_attr_ipInAddrErrors.attr, - &dev_attr_ipInUnknownProtos.attr, - &dev_attr_ipInTruncatedPkts.attr, - &dev_attr_ipInDiscards.attr, - &dev_attr_ipInDelivers.attr, - &dev_attr_ipOutForwDatagrams.attr, - &dev_attr_ipOutRequests.attr, - 
&dev_attr_ipOutDiscards.attr, - &dev_attr_ipOutNoRoutes.attr, - &dev_attr_ipReasmTimeout.attr, - &dev_attr_ipReasmReqds.attr, - &dev_attr_ipReasmOKs.attr, - &dev_attr_ipReasmFails.attr, - &dev_attr_ipFragOKs.attr, - &dev_attr_ipFragFails.attr, - &dev_attr_ipFragCreates.attr, - &dev_attr_ipInMcastPkts.attr, - &dev_attr_ipOutMcastPkts.attr, - &dev_attr_ipInBcastPkts.attr, - &dev_attr_ipOutBcastPkts.attr, - &dev_attr_tcpRtoAlgorithm.attr, - &dev_attr_tcpRtoMin.attr, - &dev_attr_tcpRtoMax.attr, - &dev_attr_tcpMaxConn.attr, - &dev_attr_tcpActiveOpens.attr, - &dev_attr_tcpPassiveOpens.attr, - &dev_attr_tcpAttemptFails.attr, - &dev_attr_tcpEstabResets.attr, - &dev_attr_tcpCurrEstab.attr, - &dev_attr_tcpInSegs.attr, - &dev_attr_tcpOutSegs.attr, - &dev_attr_tcpRetransSegs.attr, - &dev_attr_tcpInErrs.attr, - &dev_attr_tcpOutRsts.attr, - NULL -}; - -static struct attribute_group iw_stats_group = { - .name = "proto_stats", - .attrs = iw_proto_stats_attrs, -}; - -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) -{ - struct device *class_dev = &device->dev; - int ret; - int i; - - class_dev->class = &ib_class; - class_dev->parent = device->dma_device; - dev_set_name(class_dev, device->name); - dev_set_drvdata(class_dev, device); - - INIT_LIST_HEAD(&device->port_list); - - ret = device_register(class_dev); - if (ret) - goto err; - - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - ret = device_create_file(class_dev, ib_class_attributes[i]); - if (ret) - goto err_unregister; - } - - device->ports_parent = kobject_create_and_add("ports",&class_dev->kobj); - if (!device->ports_parent) { - ret = -ENOMEM; - goto err_put; - } - - if (device->node_type == RDMA_NODE_IB_SWITCH) { - ret = add_port(device, 0, port_callback); - if (ret) - goto err_put; - } else { - for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i, port_callback); - if (ret) - goto err_put; - } - } - - if 
(device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) { - ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group); - if (ret) - goto err_put; - } - - return 0; - -err_put: - { - struct kobject *p, *t; - struct ib_port *port; - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } - } - - kobject_put(&class_dev->kobj); - -err_unregister: - - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - device_remove_file(class_dev, ib_class_attributes[i]); - } - - device_unregister(class_dev); - -err: - return ret; -} - -void ib_device_unregister_sysfs(struct ib_device *device) -{ - int i; - struct kobject *p, *t; - struct ib_port *port; - struct device *class_dev = &device->dev; - - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); - - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - device_remove_file(class_dev, ib_class_attributes[i]); - } - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } - - kobject_put(device->ports_parent); - device_unregister(&device->dev); -} - -int ib_sysfs_setup(void) -{ - return class_register(&ib_class); -} - -void ib_sysfs_cleanup(void) -{ - class_unregister(&ib_class); -} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/sysfs.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/umem.c =================================================================== --- 
projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/umem.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/umem.c (nonexistent) @@ -1,445 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Cisco Systems. All rights reserved. - * Copyright (c) 2005 Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#define LINUXKPI_PARAM_PREFIX ibcore_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "uverbs.h" - -#define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *)) - -static int allow_weak_ordering; -module_param_named(weak_ordering, allow_weak_ordering, int, 0444); -MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory"); - -static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem, - struct ib_umem *umem, unsigned long addr, - int dmasync, int invalidation_supported) -{ - int ret; - const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; - struct invalidation_ctx *invalidation_ctx = NULL; - - umem->ib_peer_mem = ib_peer_mem; - if (invalidation_supported) { - invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL); - if (!invalidation_ctx) { - ret = -ENOMEM; - goto out; - } - umem->invalidation_ctx = invalidation_ctx; - invalidation_ctx->umem = umem; - mutex_lock(&ib_peer_mem->lock); - invalidation_ctx->context_ticket = - ib_peer_insert_context(ib_peer_mem, invalidation_ctx); - /* unlock before calling get pages to prevent a dead-lock from the callback */ - mutex_unlock(&ib_peer_mem->lock); - } - - ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1, - &umem->sg_head, - umem->peer_mem_client_context, - invalidation_ctx ? 
- (void *)invalidation_ctx->context_ticket : NULL); - - if (invalidation_ctx) { - /* taking the lock back, checking that wasn't invalidated at that time */ - mutex_lock(&ib_peer_mem->lock); - if (invalidation_ctx->peer_invalidated) { - printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n"); - ret = -EINVAL; - } - } - - if (ret) - goto out; - - umem->page_size = peer_mem->get_page_size - (umem->peer_mem_client_context); - if (umem->page_size <= 0) - goto put_pages; - - umem->offset = addr & ((unsigned long)umem->page_size - 1); - ret = peer_mem->dma_map(&umem->sg_head, - umem->peer_mem_client_context, - umem->context->device->dma_device, - dmasync, - &umem->nmap); - if (ret) - goto put_pages; - - ib_peer_mem->stats.num_reg_pages += - umem->nmap * (umem->page_size >> PAGE_SHIFT); - ib_peer_mem->stats.num_alloc_mrs += 1; - return umem; - -put_pages: - - peer_mem->put_pages(umem->peer_mem_client_context, - &umem->sg_head); -out: - if (invalidation_ctx) { - ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); - mutex_unlock(&umem->ib_peer_mem->lock); - kfree(invalidation_ctx); - } - - ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, - umem->peer_mem_srcu_key); - kfree(umem); - return ERR_PTR(ret); -} - -static void peer_umem_release(struct ib_umem *umem) -{ - struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem; - const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; - struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; - - if (invalidation_ctx) { - - int peer_callback; - int inflight_invalidation; - /* If we are not under peer callback we must take the lock before removing - * core ticket from the tree and releasing its umem. - * It will let any inflight callbacks to be ended safely. - * If we are under peer callback or under error flow of reg_mr so that context - * wasn't activated yet lock was already taken. 
- */ - if (invalidation_ctx->func && !invalidation_ctx->peer_callback) - mutex_lock(&ib_peer_mem->lock); - ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); - /* make sure to check inflight flag after took the lock and remove from tree. - * in addition, from that point using local variables for peer_callback and - * inflight_invalidation as after the complete invalidation_ctx can't be accessed - * any more as it may be freed by the callback. - */ - peer_callback = invalidation_ctx->peer_callback; - inflight_invalidation = invalidation_ctx->inflight_invalidation; - if (inflight_invalidation) - complete(&invalidation_ctx->comp); - /* On peer callback lock is handled externally */ - if (!peer_callback) - /* unlocking before put_pages */ - mutex_unlock(&ib_peer_mem->lock); - /* in case under callback context or callback is pending let it free the invalidation context */ - if (!peer_callback && !inflight_invalidation) - kfree(invalidation_ctx); - } - - peer_mem->dma_unmap(&umem->sg_head, - umem->peer_mem_client_context, - umem->context->device->dma_device); - peer_mem->put_pages(&umem->sg_head, - umem->peer_mem_client_context); - - ib_peer_mem->stats.num_dereg_pages += - umem->nmap * (umem->page_size >> PAGE_SHIFT); - ib_peer_mem->stats.num_dealloc_mrs += 1; - ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, - umem->peer_mem_srcu_key); - kfree(umem); - - return; - -} - -static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) -{ - - vm_object_t object; - struct scatterlist *sg; - struct page *page; - int i; - - object = NULL; - if (umem->nmap > 0) - ib_dma_unmap_sg(dev, umem->sg_head.sgl, - umem->nmap, - DMA_BIDIRECTIONAL); - for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - page = sg_page(sg); - if (umem->writable && dirty) { - if (object && object != page->object) - VM_OBJECT_WUNLOCK(object); - if (object != page->object) { - object = page->object; - VM_OBJECT_WLOCK(object); - } - 
vm_page_dirty(page); - } - } - sg_free_table(&umem->sg_head); - if (object) - VM_OBJECT_WUNLOCK(object); - -} - -void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, - umem_invalidate_func_t func, - void *cookie) -{ - struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; - - invalidation_ctx->func = func; - invalidation_ctx->cookie = cookie; - - /* from that point any pending invalidations can be called */ - mutex_unlock(&umem->ib_peer_mem->lock); - return; -} -EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); -/** - * ib_umem_get - Pin and DMA map userspace memory. - * @context: userspace context to pin memory for - * @addr: userspace virtual address to start at - * @size: length of region to pin - * @access: IB_ACCESS_xxx flags for memory being pinned - * @dmasync: flush in-flight DMA when the memory region is written - */ -struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync, - int invalidation_supported) -{ - - struct ib_umem *umem; - struct proc *proc; - pmap_t pmap; - vm_offset_t end, last, start; - vm_size_t npages; - int error; - int ret; - int ents; - int i; - DEFINE_DMA_ATTRS(attrs); - struct scatterlist *sg, *sg_list_start; - int need_release = 0; - - error = priv_check(curthread, PRIV_VM_MLOCK); - if (error) - return ERR_PTR(-error); - - last = addr + size; - start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ - end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. 
*/ - if (last < addr || end < addr) - return ERR_PTR(-EINVAL); - npages = atop(end - start); - if (npages > vm_page_max_wired) - return ERR_PTR(-ENOMEM); - umem = kzalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - proc = curthread->td_proc; - PROC_LOCK(proc); - if (ptoa(npages + - pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > - lim_cur_proc(proc, RLIMIT_MEMLOCK)) { - PROC_UNLOCK(proc); - kfree(umem); - return ERR_PTR(-ENOMEM); - } - PROC_UNLOCK(proc); - if (npages + vm_cnt.v_wire_count > vm_page_max_wired) { - kfree(umem); - return ERR_PTR(-EAGAIN); - } - error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | - (umem->writable ? VM_MAP_WIRE_WRITE : 0)); - if (error != KERN_SUCCESS) { - kfree(umem); - return ERR_PTR(-ENOMEM); - } - - umem->context = context; - umem->length = size; - umem->offset = addr & ~PAGE_MASK; - umem->page_size = PAGE_SIZE; - umem->start = addr; - /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" - * obviously require write access. "Remote atomic" can do - * things like fetch and add, which will modify memory, and - * "MW bind" can change permissions by binding a window. 
- */ - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); - - if (invalidation_supported || context->peer_mem_private_data) { - - struct ib_peer_memory_client *peer_mem_client; - - peer_mem_client = ib_get_peer_client(context, addr, size, - &umem->peer_mem_client_context, - &umem->peer_mem_srcu_key); - if (peer_mem_client) - return peer_umem_get(peer_mem_client, umem, addr, - dmasync, invalidation_supported); - } - - umem->hugetlb = 0; - - pmap = vm_map_pmap(&proc->p_vmspace->vm_map); - - if (npages == 0) { - ret = -EINVAL; - goto out; - } - - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); - if (ret) - goto out; - - need_release = 1; - sg_list_start = umem->sg_head.sgl; - - while (npages) { - - ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK); - umem->npages += ents; - - for_each_sg(sg_list_start, sg, ents, i) { - vm_paddr_t pa; - - pa = pmap_extract(pmap, start); - if (pa == 0) { - ret = -ENOMEM; - goto out; - } - sg_set_page(sg, PHYS_TO_VM_PAGE(pa), - PAGE_SIZE, 0); - npages--; - start += PAGE_SIZE; - } - - /* preparing for next loop */ - sg_list_start = sg; - } - - umem->nmap = ib_dma_map_sg_attrs(context->device, - umem->sg_head.sgl, - umem->npages, - DMA_BIDIRECTIONAL, - &attrs); - if (umem->nmap != umem->npages) { - ret = -ENOMEM; - goto out; - } - -out: - if (ret < 0) { - if (need_release) - __ib_umem_release(context->device, umem, 0); - kfree(umem); - } - - return ret < 0 ? 
ERR_PTR(ret) : umem; -} -EXPORT_SYMBOL(ib_umem_get_ex); - -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync) -{ - return ib_umem_get_ex(context, addr, - size, access, dmasync, 0); -} -EXPORT_SYMBOL(ib_umem_get); - -/** - * ib_umem_release - release memory pinned with ib_umem_get - * @umem: umem struct to release - */ -void ib_umem_release(struct ib_umem *umem) -{ - - vm_offset_t addr, end, last, start; - vm_size_t size; - int error; - - if (umem->ib_peer_mem) { - peer_umem_release(umem); - return; - } - - __ib_umem_release(umem->context->device, umem, 1); - - if (umem->context->closing) { - kfree(umem); - return; - } - - error = priv_check(curthread, PRIV_VM_MUNLOCK); - - if (error) - return; - - addr = umem->start; - size = umem->length; - last = addr + size; - start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ - end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */ - vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); - kfree(umem); - -} -EXPORT_SYMBOL(ib_umem_release); - -int ib_umem_page_count(struct ib_umem *umem) -{ - int shift; - int i; - int n; - struct scatterlist *sg; - - shift = ilog2(umem->page_size); - - n = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> shift; - - return n; -} -EXPORT_SYMBOL(ib_umem_page_count); Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/umem.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/agent.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/agent.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/agent.c (revision 319974) @@ -1,217 +1,222 @@ /* * 
Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ #include #include #include "agent.h" #include "smi.h" #include "mad_priv.h" #define SPFX "ib_agent: " struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); static LIST_HEAD(ib_agent_port_list); static struct ib_agent_port_private * -__ib_get_agent_port(struct ib_device *device, int port_num) +__ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; list_for_each_entry(entry, &ib_agent_port_list, port_list) { if (entry->agent[1]->device == device && entry->agent[1]->port_num == port_num) return entry; } return NULL; } static struct ib_agent_port_private * -ib_get_agent_port(struct ib_device *device, int port_num) +ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; unsigned long flags; spin_lock_irqsave(&ib_agent_port_list_lock, flags); entry = __ib_get_agent_port(device, port_num); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); return entry; } -void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, - struct ib_wc *wc, struct ib_device *device, - int port_num, int qpn) +void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa) { struct ib_agent_port_private *port_priv; struct ib_mad_agent *agent; struct ib_mad_send_buf *send_buf; struct ib_ah *ah; struct ib_mad_send_wr_private *mad_send_wr; - if (device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(device)) port_priv = ib_get_agent_port(device, 0); else port_priv = ib_get_agent_port(device, port_num); if (!port_priv) { - printk(KERN_ERR SPFX "Unable to find port agent\n"); + dev_err(&device->dev, "Unable to find port agent\n"); return; } agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - 
printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n", + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", PTR_ERR(ah)); return; } + if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION) + resp_mad_len = IB_MGMT_MAD_SIZE; + send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, - IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_KERNEL); + IB_MGMT_MAD_HDR, + resp_mad_len - IB_MGMT_MAD_HDR, + GFP_KERNEL, + mad_hdr->base_version); if (IS_ERR(send_buf)) { - printk(KERN_ERR SPFX "ib_create_send_mad error\n"); + dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } - memcpy(send_buf->mad, mad, sizeof *mad); + memcpy(send_buf->mad, mad_hdr, resp_mad_len); send_buf->ah = ah; - if (device->node_type == RDMA_NODE_IB_SWITCH) { + if (rdma_cap_ib_switch(device)) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); - mad_send_wr->send_wr.wr.ud.port_num = port_num; + mad_send_wr->send_wr.port_num = port_num; } if (ib_post_send_mad(send_buf, NULL)) { - printk(KERN_ERR SPFX "ib_post_send_mad error\n"); + dev_err(&device->dev, "ib_post_send_mad error\n"); goto err2; } return; err2: ib_free_send_mad(send_buf); err1: ib_destroy_ah(ah); } static void agent_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { ib_destroy_ah(mad_send_wc->send_buf->ah); ib_free_send_mad(mad_send_wc->send_buf); } int ib_agent_port_open(struct ib_device *device, int port_num) { struct ib_agent_port_private *port_priv; unsigned long flags; int ret; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n"); + dev_err(&device->dev, "No memory for ib_agent_port_private\n"); ret = -ENOMEM; goto error1; } - if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_smi(device, port_num)) { /* Obtain send only MAD agent for SMI QP */ port_priv->agent[0] = ib_register_mad_agent(device, 
port_num, IB_QPT_SMI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[0])) { ret = PTR_ERR(port_priv->agent[0]); goto error2; } } /* Obtain send only MAD agent for GSI QP */ port_priv->agent[1] = ib_register_mad_agent(device, port_num, IB_QPT_GSI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[1])) { ret = PTR_ERR(port_priv->agent[1]); goto error3; } spin_lock_irqsave(&ib_agent_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_agent_port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); return 0; error3: if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); error2: kfree(port_priv); error1: return ret; } int ib_agent_port_close(struct ib_device *device, int port_num) { struct ib_agent_port_private *port_priv; unsigned long flags; spin_lock_irqsave(&ib_agent_port_list_lock, flags); port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - printk(KERN_ERR SPFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del(&port_priv->port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); ib_unregister_mad_agent(port_priv->agent[1]); if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); kfree(port_priv); return 0; } Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/agent.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/agent.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/agent.h (revision 319974) @@ -1,51 +1,51 @@ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. 
All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #ifndef __AGENT_H_ #define __AGENT_H_ #include #include extern int ib_agent_port_open(struct ib_device *device, int port_num); extern int ib_agent_port_close(struct ib_device *device, int port_num); -extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, - struct ib_wc *wc, struct ib_device *device, - int port_num, int qpn); +extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa); #endif /* __AGENT_H_ */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cm.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cm.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cm.c (revision 319974) @@ -1,3986 +1,4141 @@ /* * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include "cm_msgs.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); -#ifdef pr_fmt -#undef pr_fmt -#endif -#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ - static void cm_add_one(struct ib_device *device); -static void cm_remove_one(struct ib_device *device); +static void cm_remove_one(struct ib_device *device, void *client_data); static struct ib_client cm_client = { .name = "cm", .add = cm_add_one, .remove = cm_remove_one }; static struct ib_cm { spinlock_t lock; struct list_head device_list; rwlock_t device_lock; struct rb_root listen_service_table; u64 listen_service_id; /* struct rb_root peer_service_table; todo: fix peer to peer */ struct rb_root remote_qp_table; struct rb_root remote_id_table; struct rb_root remote_sidr_table; struct idr local_id_table; __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; + /* Sync on cm change port state */ + spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ enum { CM_REQ_COUNTER, CM_MRA_COUNTER, CM_REJ_COUNTER, CM_REP_COUNTER, CM_RTU_COUNTER, CM_DREQ_COUNTER, CM_DREP_COUNTER, CM_SIDR_REQ_COUNTER, CM_SIDR_REP_COUNTER, CM_LAP_COUNTER, CM_APR_COUNTER, CM_ATTR_COUNT, CM_ATTR_ID_OFFSET = 0x0010, }; enum { CM_XMIT, CM_XMIT_RETRIES, CM_RECV, CM_RECV_DUPLICATES, 
CM_COUNTER_GROUPS }; static char const counter_group_names[CM_COUNTER_GROUPS] [sizeof("cm_rx_duplicates")] = { "cm_tx_msgs", "cm_tx_retries", "cm_rx_msgs", "cm_rx_duplicates" }; struct cm_counter_group { struct kobject obj; atomic_long_t counter[CM_ATTR_COUNT]; }; struct cm_counter_attribute { struct attribute attr; int index; }; #define CM_COUNTER_ATTR(_name, _index) \ struct cm_counter_attribute cm_##_name##_counter_attr = { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .index = _index \ } static CM_COUNTER_ATTR(req, CM_REQ_COUNTER); static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER); static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER); static CM_COUNTER_ATTR(rep, CM_REP_COUNTER); static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER); static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER); static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER); static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER); static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER); static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER); static CM_COUNTER_ATTR(apr, CM_APR_COUNTER); static struct attribute *cm_counter_default_attrs[] = { &cm_req_counter_attr.attr, &cm_mra_counter_attr.attr, &cm_rej_counter_attr.attr, &cm_rep_counter_attr.attr, &cm_rtu_counter_attr.attr, &cm_dreq_counter_attr.attr, &cm_drep_counter_attr.attr, &cm_sidr_req_counter_attr.attr, &cm_sidr_rep_counter_attr.attr, &cm_lap_counter_attr.attr, &cm_apr_counter_attr.attr, NULL }; struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; struct kobject port_obj; u8 port_num; + struct list_head cm_priv_prim_list; + struct list_head cm_priv_altr_list; struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; }; struct cm_device { struct list_head list; struct ib_device *ib_device; struct device *device; u8 ack_delay; + int going_down; struct cm_port *port[0]; }; struct cm_av { struct cm_port *port; union ib_gid dgid; struct ib_ah_attr ah_attr; u16 pkey_index; u8 timeout; - u8 valid; - u8 smac[ETH_ALEN]; }; struct cm_work { struct delayed_work work; 
struct list_head list; struct cm_port *port; struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */ __be32 local_id; /* Established / timewait */ __be32 remote_id; struct ib_cm_event cm_event; struct ib_sa_path_rec path[0]; }; struct cm_timewait_info { struct cm_work work; /* Must be first. */ struct list_head list; struct rb_node remote_qp_node; struct rb_node remote_id_node; __be64 remote_ca_guid; __be32 remote_qpn; u8 inserted_remote_qp; u8 inserted_remote_id; }; struct cm_id_private { struct ib_cm_id id; struct rb_node service_node; struct rb_node sidr_id_node; spinlock_t lock; /* Do not acquire inside cm.lock */ struct completion comp; atomic_t refcount; + /* Number of clients sharing this ib_cm_id. Only valid for listeners. + * Protected by the cm.lock spinlock. */ + int listen_sharecount; struct ib_mad_send_buf *msg; struct cm_timewait_info *timewait_info; /* todo: use alternate port on send failure */ struct cm_av av; struct cm_av alt_av; - struct ib_cm_compare_data *compare_data; void *private_data; __be64 tid; __be32 local_qpn; __be32 remote_qpn; enum ib_qp_type qp_type; __be32 sq_psn; __be32 rq_psn; int timeout_ms; enum ib_mtu path_mtu; __be16 pkey; u8 private_data_len; u8 max_cm_retries; u8 peer_to_peer; u8 responder_resources; u8 initiator_depth; u8 retry_count; u8 rnr_retry_count; u8 service_timeout; u8 target_ack_delay; + struct list_head prim_list; + struct list_head altr_list; + /* Indicates that the send port mad is registered and av is set */ + int prim_send_port_not_ready; + int altr_send_port_not_ready; + struct list_head work_list; atomic_t work_count; }; static void cm_work_handler(struct work_struct *work); static inline void cm_deref_id(struct cm_id_private *cm_id_priv) { if (atomic_dec_and_test(&cm_id_priv->refcount)) complete(&cm_id_priv->comp); } static int cm_alloc_msg(struct cm_id_private *cm_id_priv, struct ib_mad_send_buf **msg) { struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; + struct cm_av *av; + 
unsigned long flags, flags2; + int ret = 0; + /* don't let the port to be released till the agent is down */ + spin_lock_irqsave(&cm.state_lock, flags2); + spin_lock_irqsave(&cm.lock, flags); + if (!cm_id_priv->prim_send_port_not_ready) + av = &cm_id_priv->av; + else if (!cm_id_priv->altr_send_port_not_ready && + (cm_id_priv->alt_av.port)) + av = &cm_id_priv->alt_av; + else { + pr_info("%s: not valid CM id\n", __func__); + ret = -ENODEV; + spin_unlock_irqrestore(&cm.lock, flags); + goto out; + } + spin_unlock_irqrestore(&cm.lock, flags); + /* Make sure the port haven't released the mad yet */ mad_agent = cm_id_priv->av.port->mad_agent; - ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr); - if (IS_ERR(ah)) - return PTR_ERR(ah); + if (!mad_agent) { + pr_info("%s: not a valid MAD agent\n", __func__); + ret = -ENODEV; + goto out; + } + ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto out; + } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, - cm_id_priv->av.pkey_index, + av->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC); + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); - return PTR_ERR(m); + ret = PTR_ERR(m); + goto out; } /* Timeout set by caller if response is expected. 
*/ m->ah = ah; m->retries = cm_id_priv->max_cm_retries; atomic_inc(&cm_id_priv->refcount); m->context[0] = cm_id_priv; *msg = m; - return 0; + +out: + spin_unlock_irqrestore(&cm.state_lock, flags2); + return ret; } static int cm_alloc_response_msg(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, struct ib_mad_send_buf **msg) { struct ib_mad_send_buf *m; struct ib_ah *ah; ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, port->port_num); if (IS_ERR(ah)) return PTR_ERR(ah); m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC); + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); return PTR_ERR(m); } m->ah = ah; *msg = m; return 0; } static void cm_free_msg(struct ib_mad_send_buf *msg) { ib_destroy_ah(msg->ah); if (msg->context[0]) cm_deref_id(msg->context[0]); ib_free_send_mad(msg); } static void * cm_copy_private_data(const void *private_data, u8 private_data_len) { void *data; if (!private_data || !private_data_len) return NULL; data = kmemdup(private_data, private_data_len, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); return data; } static void cm_set_private_data(struct cm_id_private *cm_id_priv, void *private_data, u8 private_data_len) { if (cm_id_priv->private_data && cm_id_priv->private_data_len) kfree(cm_id_priv->private_data); cm_id_priv->private_data = private_data; cm_id_priv->private_data_len = private_data_len; } static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, struct ib_grh *grh, struct cm_av *av) { av->port = port; av->pkey_index = wc->pkey_index; ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, grh, &av->ah_attr); } -int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac) +static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, + struct cm_id_private *cm_id_priv) { - struct cm_id_private *cm_id_priv; - - cm_id_priv = 
container_of(id, struct cm_id_private, id); - - if (smac != NULL) - memcpy(cm_id_priv->av.smac, smac, sizeof(cm_id_priv->av.smac)); - - if (alt_smac != NULL) - memcpy(cm_id_priv->alt_av.smac, alt_smac, - sizeof(cm_id_priv->alt_av.smac)); - - return 0; -} -EXPORT_SYMBOL(ib_update_cm_av); - -static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) -{ struct cm_device *cm_dev; struct cm_port *port = NULL; unsigned long flags; int ret; u8 p; + struct net_device *ndev = ib_get_ndev_from_path(path); read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - &p, NULL)) { + path->gid_type, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } } read_unlock_irqrestore(&cm.device_lock, flags); + if (ndev) + dev_put(ndev); + if (!port) return -EINVAL; ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, be16_to_cpu(path->pkey), &av->pkey_index); if (ret) return ret; av->port = port; - ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + ret = ib_init_ah_from_path(cm_dev->ib_device, port->port_num, + path, &av->ah_attr); + if (ret) + return ret; + av->timeout = path->packet_life_time + 1; - memcpy(av->smac, path->smac, sizeof(av->smac)); - av->valid = 1; - return 0; + spin_lock_irqsave(&cm.lock, flags); + if (&cm_id_priv->av == av) + list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); + else if (&cm_id_priv->alt_av == av) + list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); + else + ret = -EINVAL; + + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; } static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; - int ret, id; - static int next_id; + int id; - do { - spin_lock_irqsave(&cm.lock, flags); - ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, - next_id, &id); - if (!ret) - next_id = ((unsigned) id + 1) & MAX_IDR_MASK; - spin_unlock_irqrestore(&cm.lock, flags); 
- } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&cm.lock, flags); + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); + + spin_unlock_irqrestore(&cm.lock, flags); + idr_preload_end(); + cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; - return ret; + return id < 0 ? id : 0; } static void cm_free_id(__be32 local_id) { spin_lock_irq(&cm.lock); idr_remove(&cm.local_id_table, (__force int) (local_id ^ cm.random_id_operand)); spin_unlock_irq(&cm.lock); } static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; cm_id_priv = idr_find(&cm.local_id_table, (__force int) (local_id ^ cm.random_id_operand)); if (cm_id_priv) { if (cm_id_priv->id.remote_id == remote_id) atomic_inc(&cm_id_priv->refcount); else cm_id_priv = NULL; } return cm_id_priv; } static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; spin_lock_irq(&cm.lock); cm_id_priv = cm_get_id(local_id, remote_id); spin_unlock_irq(&cm.lock); return cm_id_priv; } -static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask) -{ - int i; - - for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++) - ((unsigned long *) dst)[i] = ((unsigned long *) src)[i] & - ((unsigned long *) mask)[i]; -} - -static int cm_compare_data(struct ib_cm_compare_data *src_data, - struct ib_cm_compare_data *dst_data) -{ - u8 src[IB_CM_COMPARE_SIZE]; - u8 dst[IB_CM_COMPARE_SIZE]; - - if (!src_data || !dst_data) - return 0; - - cm_mask_copy(src, src_data->data, dst_data->mask); - cm_mask_copy(dst, dst_data->data, src_data->mask); - return memcmp(src, dst, IB_CM_COMPARE_SIZE); -} - -static int cm_compare_private_data(u8 *private_data, - struct ib_cm_compare_data *dst_data) -{ - u8 src[IB_CM_COMPARE_SIZE]; - - if (!dst_data) - return 0; - - cm_mask_copy(src, private_data, dst_data->mask); - return memcmp(src, 
dst_data->data, IB_CM_COMPARE_SIZE); -} - /* * Trivial helpers to strip endian annotation and compare; the * endianness doesn't actually matter since we just need a stable * order for the RB tree. */ static int be32_lt(__be32 a, __be32 b) { return (__force u32) a < (__force u32) b; } static int be32_gt(__be32 a, __be32 b) { return (__force u32) a > (__force u32) b; } static int be64_lt(__be64 a, __be64 b) { return (__force u64) a < (__force u64) b; } static int be64_gt(__be64 a, __be64 b) { return (__force u64) a > (__force u64) b; } static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.listen_service_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; __be64 service_mask = cm_id_priv->id.service_mask; - int data_cmp; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); - data_cmp = cm_compare_data(cm_id_priv->compare_data, - cur_cm_id_priv->compare_data); if ((cur_cm_id_priv->id.service_mask & service_id) == (service_mask & cur_cm_id_priv->id.service_id) && - (cm_id_priv->id.device == cur_cm_id_priv->id.device) && - !data_cmp) + (cm_id_priv->id.device == cur_cm_id_priv->id.device)) return cur_cm_id_priv; if (cm_id_priv->id.device < cur_cm_id_priv->id.device) link = &(*link)->rb_left; else if (cm_id_priv->id.device > cur_cm_id_priv->id.device) link = &(*link)->rb_right; else if (be64_lt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_left; else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; - else if (data_cmp < 0) - link = &(*link)->rb_left; else link = &(*link)->rb_right; } rb_link_node(&cm_id_priv->service_node, parent, link); rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); return NULL; } static struct cm_id_private * cm_find_listen(struct ib_device *device, - __be64 service_id, - u8 *private_data) + __be64 
service_id) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; - int data_cmp; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, service_node); - data_cmp = cm_compare_private_data(private_data, - cm_id_priv->compare_data); if ((cm_id_priv->id.service_mask & service_id) == cm_id_priv->id.service_id && - (cm_id_priv->id.device == device) && !data_cmp) + (cm_id_priv->id.device == device)) return cm_id_priv; if (device < cm_id_priv->id.device) node = node->rb_left; else if (device > cm_id_priv->id.device) node = node->rb_right; else if (be64_lt(service_id, cm_id_priv->id.service_id)) node = node->rb_left; else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; - else if (data_cmp < 0) - node = node->rb_left; else node = node->rb_right; } return NULL; } static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_id_table.rb_node; struct rb_node *parent = NULL; struct cm_timewait_info *cur_timewait_info; __be64 remote_ca_guid = timewait_info->remote_ca_guid; __be32 remote_id = timewait_info->work.remote_id; while (*link) { parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_id_node); if (be32_lt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_left; else if (be32_gt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_right; else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; } timewait_info->inserted_remote_id = 1; rb_link_node(&timewait_info->remote_id_node, parent, link); rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table); return NULL; } static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, __be32 remote_id) { struct rb_node *node = 
cm.remote_id_table.rb_node; struct cm_timewait_info *timewait_info; while (node) { timewait_info = rb_entry(node, struct cm_timewait_info, remote_id_node); if (be32_lt(remote_id, timewait_info->work.remote_id)) node = node->rb_left; else if (be32_gt(remote_id, timewait_info->work.remote_id)) node = node->rb_right; else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_left; else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_right; else return timewait_info; } return NULL; } static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_qp_table.rb_node; struct rb_node *parent = NULL; struct cm_timewait_info *cur_timewait_info; __be64 remote_ca_guid = timewait_info->remote_ca_guid; __be32 remote_qpn = timewait_info->remote_qpn; while (*link) { parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_qp_node); if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_left; else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_right; else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; } timewait_info->inserted_remote_qp = 1; rb_link_node(&timewait_info->remote_qp_node, parent, link); rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table); return NULL; } static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.remote_sidr_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; union ib_gid *port_gid = &cm_id_priv->av.dgid; __be32 remote_id = cm_id_priv->id.remote_id; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, sidr_id_node); if (be32_lt(remote_id, 
cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_left; else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_right; else { int cmp; cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid, sizeof *port_gid); if (cmp < 0) link = &(*link)->rb_left; else if (cmp > 0) link = &(*link)->rb_right; else return cur_cm_id_priv; } } rb_link_node(&cm_id_priv->sidr_id_node, parent, link); rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); return NULL; } static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv, enum ib_cm_sidr_status status) { struct ib_cm_sidr_rep_param param; memset(&param, 0, sizeof param); param.status = status; ib_send_cm_sidr_rep(&cm_id_priv->id, &param); } struct ib_cm_id *ib_create_cm_id(struct ib_device *device, ib_cm_handler cm_handler, void *context) { struct cm_id_private *cm_id_priv; int ret; cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->id.state = IB_CM_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.remote_cm_qpn = 1; ret = cm_alloc_id(cm_id_priv); if (ret) goto error; spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->prim_list); + INIT_LIST_HEAD(&cm_id_priv->altr_list); atomic_set(&cm_id_priv->work_count, -1); atomic_set(&cm_id_priv->refcount, 1); return &cm_id_priv->id; error: kfree(cm_id_priv); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(ib_create_cm_id); static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv) { struct cm_work *work; if (list_empty(&cm_id_priv->work_list)) return NULL; work = list_entry(cm_id_priv->work_list.next, struct cm_work, list); list_del(&work->list); return work; } static void cm_free_work(struct cm_work *work) { if (work->mad_recv_wc) ib_free_recv_mad(work->mad_recv_wc); kfree(work); } static inline int cm_convert_to_ms(int iba_time) { /*
approximate conversion to ms from 4.096us x 2^iba_time */ return 1 << max(iba_time - 8, 0); } /* * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time * Because of how ack_timeout is stored, adding one doubles the timeout. * To avoid large timeouts, select the max(ack_delay, life_time + 1), and * increment it (round up) only if the other is within 50%. */ static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) { int ack_timeout = packet_life_time + 1; if (ack_timeout >= ca_ack_delay) ack_timeout += (ca_ack_delay >= (ack_timeout - 1)); else ack_timeout = ca_ack_delay + (ack_timeout >= (ca_ack_delay - 1)); return min(31, ack_timeout); } static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) { if (timewait_info->inserted_remote_id) { rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); timewait_info->inserted_remote_id = 0; } if (timewait_info->inserted_remote_qp) { rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table); timewait_info->inserted_remote_qp = 0; } } -static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id, gfp_t flags) +static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) { struct cm_timewait_info *timewait_info; - timewait_info = kzalloc(sizeof *timewait_info, flags); + timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); if (!timewait_info) return ERR_PTR(-ENOMEM); timewait_info->work.local_id = local_id; INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler); timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT; return timewait_info; } static void cm_enter_timewait(struct cm_id_private *cm_id_priv) { int wait_time; unsigned long flags; + struct cm_device *cm_dev; + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); + if (!cm_dev) + return; + spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); spin_unlock_irqrestore(&cm.lock, 
flags); /* * The cm_id could be destroyed by the user before we exit timewait. * To protect against this, we search for the cm_id after exiting * timewait before notifying the user that we've exited timewait. */ cm_id_priv->id.state = IB_CM_TIMEWAIT; wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); - queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, - msecs_to_jiffies(wait_time)); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + spin_unlock_irqrestore(&cm.lock, flags); + cm_id_priv->timewait_info = NULL; } static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) { unsigned long flags; cm_id_priv->id.state = IB_CM_IDLE; if (cm_id_priv->timewait_info) { spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irqrestore(&cm.lock, flags); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; } } static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; struct cm_work *work; cm_id_priv = container_of(cm_id, struct cm_id_private, id); retest: spin_lock_irq(&cm_id_priv->lock); switch (cm_id->state) { case IB_CM_LISTEN: - cm_id->state = IB_CM_IDLE; spin_unlock_irq(&cm_id_priv->lock); + spin_lock_irq(&cm.lock); + if (--cm_id_priv->listen_sharecount > 0) { + /* The id is still shared. 
*/ + cm_deref_id(cm_id_priv); + spin_unlock_irq(&cm.lock); + return; + } rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); spin_unlock_irq(&cm.lock); break; case IB_CM_SIDR_REQ_SENT: cm_id->state = IB_CM_IDLE; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); break; case IB_CM_SIDR_REQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); + spin_lock_irq(&cm.lock); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, + &cm.remote_sidr_table); + spin_unlock_irq(&cm.lock); break; case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, &cm_id_priv->id.device->node_guid, sizeof cm_id_priv->id.device->node_guid, NULL, 0); break; case IB_CM_REQ_RCVD: if (err == -ENOMEM) { /* Do not reject to allow future retries. */ cm_reset_to_idle(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); } else { spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); } break; - case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); /* Fall through */ case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); break; case IB_CM_ESTABLISHED: spin_unlock_irq(&cm_id_priv->lock); if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) break; ib_send_cm_dreq(cm_id, NULL, 0); goto retest; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); cm_enter_timewait(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); break; case IB_CM_DREQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_drep(cm_id, NULL, 0); break; default: spin_unlock_irq(&cm_id_priv->lock); break; } + 
spin_lock_irq(&cm.lock); + if (!list_empty(&cm_id_priv->altr_list) && + (!cm_id_priv->altr_send_port_not_ready)) + list_del(&cm_id_priv->altr_list); + if (!list_empty(&cm_id_priv->prim_list) && + (!cm_id_priv->prim_send_port_not_ready)) + list_del(&cm_id_priv->prim_list); + spin_unlock_irq(&cm.lock); + cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); - kfree(cm_id_priv->compare_data); kfree(cm_id_priv->private_data); kfree(cm_id_priv); } void ib_destroy_cm_id(struct ib_cm_id *cm_id) { cm_destroy_id(cm_id, 0); } EXPORT_SYMBOL(ib_destroy_cm_id); -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, - struct ib_cm_compare_data *compare_data) +/** + * __ib_cm_listen - Initiates listening on the specified service ID for + * connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + * range of service IDs. If set to 0, the service ID is matched + * exactly. This parameter is ignored if %service_id is set to + * IB_CM_ASSIGN_SERVICE_ID. + */ +static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, + __be64 service_mask) { struct cm_id_private *cm_id_priv, *cur_cm_id_priv; - unsigned long flags; int ret = 0; service_mask = service_mask ? 
service_mask : ~cpu_to_be64(0); service_id &= service_mask; if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && (service_id != IB_CM_ASSIGN_SERVICE_ID)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); if (cm_id->state != IB_CM_IDLE) return -EINVAL; - if (compare_data) { - cm_id_priv->compare_data = kzalloc(sizeof *compare_data, - GFP_KERNEL); - if (!cm_id_priv->compare_data) - return -ENOMEM; - cm_mask_copy(cm_id_priv->compare_data->data, - compare_data->data, compare_data->mask); - memcpy(cm_id_priv->compare_data->mask, compare_data->mask, - IB_CM_COMPARE_SIZE); - } - cm_id->state = IB_CM_LISTEN; + ++cm_id_priv->listen_sharecount; - spin_lock_irqsave(&cm.lock, flags); if (service_id == IB_CM_ASSIGN_SERVICE_ID) { cm_id->service_id = cpu_to_be64(cm.listen_service_id++); cm_id->service_mask = ~cpu_to_be64(0); } else { cm_id->service_id = service_id; cm_id->service_mask = service_mask; } cur_cm_id_priv = cm_insert_listen(cm_id_priv); - spin_unlock_irqrestore(&cm.lock, flags); if (cur_cm_id_priv) { cm_id->state = IB_CM_IDLE; - kfree(cm_id_priv->compare_data); - cm_id_priv->compare_data = NULL; + --cm_id_priv->listen_sharecount; ret = -EBUSY; } return ret; } + +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm.lock, flags); + ret = __ib_cm_listen(cm_id, service_id, service_mask); + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; +} EXPORT_SYMBOL(ib_cm_listen); +/** + * Create a new listening ib_cm_id and listen on the given service ID. + * + * If there's an existing ID listening on that same device and service ID, + * return it. + * + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. 
+ * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * + * Callers should call ib_destroy_cm_id when done with the listener ID. + */ +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id) +{ + struct cm_id_private *cm_id_priv; + struct ib_cm_id *cm_id; + unsigned long flags; + int err = 0; + + /* Create an ID in advance, since the creation may sleep */ + cm_id = ib_create_cm_id(device, cm_handler, NULL); + if (IS_ERR(cm_id)) + return cm_id; + + spin_lock_irqsave(&cm.lock, flags); + + if (service_id == IB_CM_ASSIGN_SERVICE_ID) + goto new_id; + + /* Find an existing ID */ + cm_id_priv = cm_find_listen(device, service_id); + if (cm_id_priv) { + if (cm_id->cm_handler != cm_handler || cm_id->context) { + /* Sharing an ib_cm_id with different handlers is not + * supported */ + spin_unlock_irqrestore(&cm.lock, flags); + return ERR_PTR(-EINVAL); + } + atomic_inc(&cm_id_priv->refcount); + ++cm_id_priv->listen_sharecount; + spin_unlock_irqrestore(&cm.lock, flags); + + ib_destroy_cm_id(cm_id); + cm_id = &cm_id_priv->id; + return cm_id; + } + +new_id: + /* Use newly created ID */ + err = __ib_cm_listen(cm_id, service_id, 0); + + spin_unlock_irqrestore(&cm.lock, flags); + + if (err) { + ib_destroy_cm_id(cm_id); + return ERR_PTR(err); + } + return cm_id; +} +EXPORT_SYMBOL(ib_cm_insert_listen); + static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, enum cm_msg_sequence msg_seq) { u64 hi_tid, low_tid; hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; low_tid = (u64) ((__force u32)cm_id_priv->id.local_id | (msg_seq << 30)); return cpu_to_be64(hi_tid | low_tid); } static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, __be64 tid) { hdr->base_version = IB_MGMT_BASE_VERSION; hdr->mgmt_class 
= IB_MGMT_CLASS_CM; hdr->class_version = IB_CM_CLASS_VERSION; hdr->method = IB_MGMT_METHOD_SEND; hdr->attr_id = attr_id; hdr->tid = tid; } static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) { struct ib_sa_path_rec *pri_path = param->primary_path; struct ib_sa_path_rec *alt_path = param->alternate_path; cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); req_msg->local_comm_id = cm_id_priv->id.local_id; req_msg->service_id = param->service_id; req_msg->local_ca_guid = cm_id_priv->id.device->node_guid; cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num)); cm_req_set_init_depth(req_msg, param->initiator_depth); cm_req_set_remote_resp_timeout(req_msg, param->remote_cm_response_timeout); cm_req_set_qp_type(req_msg, param->qp_type); cm_req_set_flow_ctrl(req_msg, param->flow_control); cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); cm_req_set_local_resp_timeout(req_msg, param->local_cm_response_timeout); req_msg->pkey = param->primary_path->pkey; cm_req_set_path_mtu(req_msg, param->primary_path->mtu); cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); if (param->qp_type != IB_QPT_XRC_INI) { cm_req_set_resp_res(req_msg, param->responder_resources); cm_req_set_retry_count(req_msg, param->retry_count); cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); - cm_req_set_srq(req_msg, param->srq); + cm_req_set_srq(req_msg, param->srq); } if (pri_path->hop_limit <= 1) { req_msg->primary_local_lid = pri_path->slid; req_msg->primary_remote_lid = pri_path->dlid; } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } req_msg->primary_local_gid = pri_path->sgid; req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, 
pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; req_msg->primary_hop_limit = pri_path->hop_limit; cm_req_set_primary_sl(req_msg, pri_path->sl); cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1)); cm_req_set_primary_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, pri_path->packet_life_time)); if (alt_path) { if (alt_path->hop_limit <= 1) { req_msg->alt_local_lid = alt_path->slid; req_msg->alt_remote_lid = alt_path->dlid; } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } req_msg->alt_local_gid = alt_path->sgid; req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); req_msg->alt_traffic_class = alt_path->traffic_class; req_msg->alt_hop_limit = alt_path->hop_limit; cm_req_set_alt_sl(req_msg, alt_path->sl); cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1)); cm_req_set_alt_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alt_path->packet_life_time)); } if (param->private_data && param->private_data_len) memcpy(req_msg->private_data, param->private_data, param->private_data_len); } static int cm_validate_req_param(struct ib_cm_req_param *param) { /* peer-to-peer not supported */ if (param->peer_to_peer) return -EINVAL; if (!param->primary_path) return -EINVAL; if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && param->qp_type != IB_QPT_XRC_INI) return -EINVAL; if (param->private_data && param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE) return -EINVAL; if (param->alternate_path && (param->alternate_path->pkey != param->primary_path->pkey || param->alternate_path->mtu != param->primary_path->mtu)) return -EINVAL; return 0; } int ib_send_cm_req(struct ib_cm_id *cm_id, struct ib_cm_req_param *param) { struct cm_id_private *cm_id_priv; struct cm_req_msg *req_msg; unsigned long flags; int 
ret; ret = cm_validate_req_param(param); if (ret) return ret; /* Verify that we're not in timewait. */ cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_IDLE) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return -EINVAL; + ret = -EINVAL; + goto out; } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id, - GFP_ATOMIC); + id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return (PTR_ERR(cm_id_priv->timewait_info)); + ret = PTR_ERR(cm_id_priv->timewait_info); + goto out; } - ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); - if (!ret && param->alternate_path) { + ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, + cm_id_priv); + if (ret) + goto error1; + if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, - &cm_id_priv->alt_av); - } - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); + &cm_id_priv->alt_av, cm_id_priv); + if (ret) goto error1; } - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = cm_convert_to_ms( param->primary_path->packet_life_time) * 2 + cm_convert_to_ms( param->remote_cm_response_timeout); cm_id_priv->max_cm_retries = param->max_cm_retries; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->retry_count = param->retry_count; cm_id_priv->path_mtu = param->primary_path->mtu; cm_id_priv->pkey = param->primary_path->pkey; cm_id_priv->qp_type = param->qp_type; ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg); if (ret) goto error1; req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad; cm_format_req(req_msg, cm_id_priv, param); cm_id_priv->tid = req_msg->hdr.tid; 
cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms; cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT; cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg); cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg); spin_lock_irqsave(&cm_id_priv->lock, flags); ret = ib_post_send_mad(cm_id_priv->msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); goto error2; } BUG_ON(cm_id->state != IB_CM_IDLE); cm_id->state = IB_CM_REQ_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; -error2: - cm_free_msg(cm_id_priv->msg); -error1: - kfree(cm_id_priv->timewait_info); - return ret; +error2: cm_free_msg(cm_id_priv->msg); +error1: kfree(cm_id_priv->timewait_info); +out: return ret; } EXPORT_SYMBOL(ib_send_cm_req); static int cm_issue_rej(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, enum ib_cm_rej_reason reason, enum cm_msg_response msg_rejected, void *ari, u8 ari_length) { struct ib_mad_send_buf *msg = NULL; struct cm_rej_msg *rej_msg, *rcv_msg; int ret; ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); if (ret) return ret; /* We just need common CM header information. Cast to any message. 
*/ rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad; rej_msg = (struct cm_rej_msg *) msg->mad; cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid); rej_msg->remote_comm_id = rcv_msg->local_comm_id; rej_msg->local_comm_id = rcv_msg->remote_comm_id; cm_rej_set_msg_rejected(rej_msg, msg_rejected); rej_msg->reason = cpu_to_be16(reason); if (ari && ari_length) { cm_rej_set_reject_info_len(rej_msg, ari_length); memcpy(rej_msg->ari, ari, ari_length); } ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); return ret; } -static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, - __be32 local_qpn, __be32 remote_qpn) -{ - return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) || - ((local_ca_guid == remote_ca_guid) && - (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); -} - static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct ib_sa_path_rec *primary_path, struct ib_sa_path_rec *alt_path) { memset(primary_path, 0, sizeof *primary_path); primary_path->dgid = req_msg->primary_local_gid; primary_path->sgid = req_msg->primary_remote_gid; primary_path->dlid = req_msg->primary_local_lid; primary_path->slid = req_msg->primary_remote_lid; primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); primary_path->hop_limit = req_msg->primary_hop_limit; primary_path->traffic_class = req_msg->primary_traffic_class; primary_path->reversible = 1; primary_path->pkey = req_msg->pkey; primary_path->sl = cm_req_get_primary_sl(req_msg); primary_path->mtu_selector = IB_SA_EQ; primary_path->mtu = cm_req_get_path_mtu(req_msg); primary_path->rate_selector = IB_SA_EQ; primary_path->rate = cm_req_get_primary_packet_rate(req_msg); primary_path->packet_life_time_selector = IB_SA_EQ; primary_path->packet_life_time = cm_req_get_primary_local_ack_timeout(req_msg); primary_path->packet_life_time -= (primary_path->packet_life_time > 0); + primary_path->service_id = req_msg->service_id; if (req_msg->alt_local_lid) 
{ memset(alt_path, 0, sizeof *alt_path); alt_path->dgid = req_msg->alt_local_gid; alt_path->sgid = req_msg->alt_remote_gid; alt_path->dlid = req_msg->alt_local_lid; alt_path->slid = req_msg->alt_remote_lid; alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); alt_path->hop_limit = req_msg->alt_hop_limit; alt_path->traffic_class = req_msg->alt_traffic_class; alt_path->reversible = 1; alt_path->pkey = req_msg->pkey; alt_path->sl = cm_req_get_alt_sl(req_msg); alt_path->mtu_selector = IB_SA_EQ; alt_path->mtu = cm_req_get_path_mtu(req_msg); alt_path->rate_selector = IB_SA_EQ; alt_path->rate = cm_req_get_alt_packet_rate(req_msg); alt_path->packet_life_time_selector = IB_SA_EQ; alt_path->packet_life_time = cm_req_get_alt_local_ack_timeout(req_msg); alt_path->packet_life_time -= (alt_path->packet_life_time > 0); + alt_path->service_id = req_msg->service_id; } } +static u16 cm_get_bth_pkey(struct cm_work *work) +{ + struct ib_device *ib_dev = work->port->cm_dev->ib_device; + u8 port_num = work->port->port_num; + u16 pkey_index = work->mad_recv_wc->wc->pkey_index; + u16 pkey; + int ret; + + ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); + if (ret) { + dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). 
%d\n", + port_num, pkey_index, ret); + return 0; + } + + return pkey; +} + static void cm_format_req_event(struct cm_work *work, struct cm_id_private *cm_id_priv, struct ib_cm_id *listen_id) { struct cm_req_msg *req_msg; struct ib_cm_req_event_param *param; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.req_rcvd; param->listen_id = listen_id; + param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; if (req_msg->alt_local_lid) param->alternate_path = &work->path[1]; else param->alternate_path = NULL; param->remote_ca_guid = req_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(req_msg->local_qkey); param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); param->qp_type = cm_req_get_qp_type(req_msg); param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg)); param->responder_resources = cm_req_get_init_depth(req_msg); param->initiator_depth = cm_req_get_resp_res(req_msg); param->local_cm_response_timeout = cm_req_get_remote_resp_timeout(req_msg); param->flow_control = cm_req_get_flow_ctrl(req_msg); param->remote_cm_response_timeout = cm_req_get_local_resp_timeout(req_msg); param->retry_count = cm_req_get_retry_count(req_msg); param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); param->srq = cm_req_get_srq(req_msg); work->cm_event.private_data = &req_msg->private_data; } static void cm_process_work(struct cm_id_private *cm_id_priv, struct cm_work *work) { int ret; /* We will typically only have the current event to report. 
*/ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) { spin_lock_irq(&cm_id_priv->lock); work = cm_dequeue_work(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); BUG_ON(!work); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); } cm_deref_id(cm_id_priv); if (ret) cm_destroy_id(&cm_id_priv->id, ret); } static void cm_format_mra(struct cm_mra_msg *mra_msg, struct cm_id_private *cm_id_priv, enum cm_msg_response msg_mraed, u8 service_timeout, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); cm_mra_set_msg_mraed(mra_msg, msg_mraed); mra_msg->local_comm_id = cm_id_priv->id.local_id; mra_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_mra_set_service_timeout(mra_msg, service_timeout); if (private_data && private_data_len) memcpy(mra_msg->private_data, private_data, private_data_len); } static void cm_format_rej(struct cm_rej_msg *rej_msg, struct cm_id_private *cm_id_priv, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); rej_msg->remote_comm_id = cm_id_priv->id.remote_id; switch(cm_id_priv->id.state) { case IB_CM_REQ_RCVD: rej_msg->local_comm_id = 0; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_MRA_REQ_SENT: rej_msg->local_comm_id = cm_id_priv->id.local_id; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: rej_msg->local_comm_id = cm_id_priv->id.local_id; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP); break; default: rej_msg->local_comm_id = cm_id_priv->id.local_id; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER); break; } rej_msg->reason = cpu_to_be16(reason); if (ari && ari_length) { cm_rej_set_reject_info_len(rej_msg, ari_length); 
memcpy(rej_msg->ari, ari, ari_length); } if (private_data && private_data_len) memcpy(rej_msg->private_data, private_data, private_data_len); } static void cm_dup_req_handler(struct cm_work *work, struct cm_id_private *cm_id_priv) { struct ib_mad_send_buf *msg = NULL; int ret; atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_REQ_COUNTER]); /* Quick state check to discard duplicate REQs. */ if (cm_id_priv->id.state == IB_CM_REQ_RCVD) return; ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); if (ret) return; spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_MRA_REQ_SENT: cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); break; case IB_CM_TIMEWAIT: cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0); break; default: goto unlock; } spin_unlock_irq(&cm_id_priv->lock); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; return; unlock: spin_unlock_irq(&cm_id_priv->lock); free: cm_free_msg(msg); } static struct cm_id_private * cm_match_req(struct cm_work *work, struct cm_id_private *cm_id_priv) { struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; struct cm_timewait_info *timewait_info; struct cm_req_msg *req_msg; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; /* Check for possible duplicate REQ. */ spin_lock_irq(&cm.lock); timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); if (timewait_info) { cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock_irq(&cm.lock); if (cur_cm_id_priv) { cm_dup_req_handler(work, cur_cm_id_priv); cm_deref_id(cur_cm_id_priv); } return NULL; } /* Check for stale connections. 
*/ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, NULL, 0); return NULL; } /* Find matching listen request. */ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, - req_msg->service_id, - req_msg->private_data); + req_msg->service_id); if (!listen_cm_id_priv) { cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, NULL, 0); goto out; } atomic_inc(&listen_cm_id_priv->refcount); atomic_inc(&cm_id_priv->refcount); cm_id_priv->id.state = IB_CM_REQ_RCVD; atomic_inc(&cm_id_priv->work_count); spin_unlock_irq(&cm.lock); out: return listen_cm_id_priv; } /* * Work-around for inter-subnet connections. If the LIDs are permissive, * we need to override the LID/SL data in the REQ with the LID information * in the work completion. 
*/ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { req_msg->primary_local_lid = cpu_to_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE) req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits); } if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { req_msg->alt_local_lid = cpu_to_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE) req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits); } } static int cm_req_handler(struct cm_work *work) { struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); cm_id_priv = container_of(cm_id, struct cm_id_private, id); cm_id_priv->id.remote_id = req_msg->local_comm_id; cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id, - GFP_KERNEL); + id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); goto destroy; } cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id; cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid; cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg); listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { ret = -EINVAL; kfree(cm_id_priv->timewait_info); goto destroy; } cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; cm_id_priv->id.context = listen_cm_id_priv->id.context; 
cm_id_priv->id.service_id = req_msg->service_id; cm_id_priv->id.service_mask = ~cpu_to_be64(0); cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); - /* Workarround: path in req_msg doesn't contain MAC, take it from wc */ - memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, 6); - work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); + work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + cm_id_priv->av.ah_attr.grh.sgid_index, + &gid, &gid_attr); + if (!ret) { + if (gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->if_index; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); + } if (ret) { - ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, &work->path[0].sgid); + int err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + &gid_attr); + if (!err && gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->if_index; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } if (req_msg->alt_local_lid) { - ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av); + ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, + cm_id_priv); if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } } cm_id_priv->tid = req_msg->hdr.tid; cm_id_priv->timeout_ms = cm_convert_to_ms( 
cm_req_get_local_resp_timeout(req_msg)); cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg); cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg); cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg); cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg); cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg); cm_id_priv->pkey = req_msg->pkey; cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg); cm_id_priv->retry_count = cm_req_get_retry_count(req_msg); cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); cm_process_work(cm_id_priv, work); cm_deref_id(listen_cm_id_priv); return 0; rejected: atomic_dec(&cm_id_priv->refcount); cm_deref_id(listen_cm_id_priv); destroy: ib_destroy_cm_id(cm_id); return ret; } static void cm_format_rep(struct cm_rep_msg *rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_rep_param *param) { cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); rep_msg->local_comm_id = cm_id_priv->id.local_id; rep_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); rep_msg->resp_resources = param->responder_resources; cm_rep_set_target_ack_delay(rep_msg, cm_id_priv->av.port->cm_dev->ack_delay); cm_rep_set_failover(rep_msg, param->failover_accepted); cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count); rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid; if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { rep_msg->initiator_depth = param->initiator_depth; cm_rep_set_flow_ctrl(rep_msg, param->flow_control); cm_rep_set_srq(rep_msg, param->srq); cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); } else { cm_rep_set_srq(rep_msg, 1); cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num)); } if (param->private_data && param->private_data_len) memcpy(rep_msg->private_data, param->private_data, 
param->private_data_len); } int ib_send_cm_rep(struct ib_cm_id *cm_id, struct ib_cm_rep_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; struct cm_rep_msg *rep_msg; unsigned long flags; int ret; if (param->private_data && param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; rep_msg = (struct cm_rep_msg *) msg->mad; cm_format_rep(rep_msg, cm_id_priv, param); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->state = IB_CM_REP_SENT; cm_id_priv->msg = msg; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg); cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rep); static void cm_format_rtu(struct cm_rtu_msg *rtu_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid); rtu_msg->local_comm_id = cm_id_priv->id.local_id; rtu_msg->remote_comm_id = cm_id_priv->id.remote_id; if (private_data && private_data_len) memcpy(rtu_msg->private_data, private_data, private_data_len); } int ib_send_cm_rtu(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; void *data; int ret; if (private_data && private_data_len > 
IB_CM_RTU_PRIVATE_DATA_SIZE) return -EINVAL; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto error; } ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto error; cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); kfree(data); return ret; } cm_id->state = IB_CM_ESTABLISHED; cm_set_private_data(cm_id_priv, data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); return ret; } EXPORT_SYMBOL(ib_send_cm_rtu); static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) { struct cm_rep_msg *rep_msg; struct ib_cm_rep_event_param *param; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rep_rcvd; param->remote_ca_guid = rep_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(rep_msg->local_qkey); param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg)); param->responder_resources = rep_msg->initiator_depth; param->initiator_depth = rep_msg->resp_resources; param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); param->failover_accepted = cm_rep_get_failover(rep_msg); param->flow_control = cm_rep_get_flow_ctrl(rep_msg); param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); param->srq = cm_rep_get_srq(rep_msg); work->cm_event.private_data = &rep_msg->private_data; } static void cm_dup_rep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rep_msg 
*rep_msg; struct ib_mad_send_buf *msg = NULL; int ret; rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, rep_msg->local_comm_id); if (!cm_id_priv) return; atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_REP_COUNTER]); ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); if (ret) goto deref; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state == IB_CM_ESTABLISHED) cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, cm_id_priv->private_data, cm_id_priv->private_data_len); else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); else goto unlock; spin_unlock_irq(&cm_id_priv->lock); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; goto deref; unlock: spin_unlock_irq(&cm_id_priv->lock); free: cm_free_msg(msg); deref: cm_deref_id(cm_id_priv); } static int cm_rep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rep_msg *rep_msg; int ret; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); - pr_debug("no cm_id_priv\n"); return -EINVAL; } cm_format_rep_event(work, cm_id_priv->qp_type); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: break; default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto error; } cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id; cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid; cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); spin_lock(&cm.lock); /* Check for duplicate REP. 
*/ if (cm_insert_remote_id(cm_id_priv->timewait_info)) { spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("Failed to insert remote id\n"); goto error; } /* Check for a stale connection. */ if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) { rb_erase(&cm_id_priv->timewait_info->remote_id_node, &cm.remote_id_table); cm_id_priv->timewait_info->inserted_remote_id = 0; spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; - pr_debug("Stale connection.\n"); goto error; } spin_unlock(&cm.lock); cm_id_priv->id.state = IB_CM_REP_RCVD; cm_id_priv->id.remote_id = rep_msg->local_comm_id; cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); cm_id_priv->initiator_depth = rep_msg->resp_resources; cm_id_priv->responder_resources = rep_msg->initiator_depth; cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg); cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); cm_id_priv->av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->av.timeout - 1); cm_id_priv->alt_av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->alt_av.timeout - 1); /* todo: handle peer_to_peer */ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; error: cm_deref_id(cm_id_priv); return ret; } static int cm_establish_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; int ret; /* See comment in cm_establish about lookup. 
*/ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); if (!cm_id_priv) return -EINVAL; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { spin_unlock_irq(&cm_id_priv->lock); goto out; } ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_rtu_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rtu_msg *rtu_msg; int ret; rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id, rtu_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = &rtu_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_REP_SENT && cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { spin_unlock_irq(&cm_id_priv->lock); atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_RTU_COUNTER]); goto out; } cm_id_priv->id.state = IB_CM_ESTABLISHED; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ)); dreq_msg->local_comm_id = cm_id_priv->id.local_id; dreq_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn); if (private_data && private_data_len) memcpy(dreq_msg->private_data, private_data, private_data_len); } int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } if (cm_id->lap_state == IB_CM_LAP_SENT || cm_id->lap_state == IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) { cm_enter_timewait(cm_id_priv); goto out; } cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, private_data, private_data_len); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; ret = ib_post_send_mad(msg, NULL); if (ret) { cm_enter_timewait(cm_id_priv); spin_unlock_irqrestore(&cm_id_priv->lock, flags); 
cm_free_msg(msg); return ret; } cm_id->state = IB_CM_DREQ_SENT; cm_id_priv->msg = msg; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_dreq); static void cm_format_drep(struct cm_drep_msg *drep_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid); drep_msg->local_comm_id = cm_id_priv->id.local_id; drep_msg->remote_comm_id = cm_id_priv->id.remote_id; if (private_data && private_data_len) memcpy(drep_msg->private_data, private_data, private_data_len); } int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; void *data; int ret; if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) return -EINVAL; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); - pr_debug("cm_id->state(%d) != IB_CM_DREQ_RCVD\n", cm_id->state); return -EINVAL; } cm_set_private_data(cm_id_priv, data, private_data_len); cm_enter_timewait(cm_id_priv); ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_drep); static int cm_issue_drep(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_buf *msg = NULL; struct cm_dreq_msg *dreq_msg; struct cm_drep_msg *drep_msg; int ret; ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); if 
(ret) return ret; dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad; drep_msg = (struct cm_drep_msg *) msg->mad; cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid); drep_msg->remote_comm_id = dreq_msg->local_comm_id; drep_msg->local_comm_id = dreq_msg->remote_comm_id; ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); return ret; } static int cm_dreq_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_dreq_msg *dreq_msg; struct ib_mad_send_buf *msg = NULL; int ret; dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id, dreq_msg->local_comm_id); if (!cm_id_priv) { atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); - pr_debug("no cm_id_priv\n"); return -EINVAL; } work->cm_event.private_data = &dreq_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg)) goto unlock; switch (cm_id_priv->id.state) { case IB_CM_REP_SENT: case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); break; case IB_CM_MRA_REP_RCVD: break; case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) goto unlock; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); if (ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_DREQ_COUNTER]); goto unlock; default: - pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; cm_id_priv->tid = dreq_msg->hdr.tid; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); deref: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_drep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_drep_msg *drep_msg; int ret; drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id, drep_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = &drep_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_DREQ_SENT && cm_id_priv->id.state != IB_CM_DREQ_RCVD) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_enter_timewait(cm_id_priv); ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id->state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case 
IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: ret = cm_alloc_msg(cm_id_priv, &msg); if (!ret) cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, reason, ari, ari_length, private_data, private_data_len); cm_reset_to_idle(cm_id_priv); break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ret = cm_alloc_msg(cm_id_priv, &msg); if (!ret) cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, reason, ari, ari_length, private_data, private_data_len); cm_enter_timewait(cm_id_priv); break; default: - pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; goto out; } if (ret) goto out; ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rej); static void cm_format_rej_event(struct cm_work *work) { struct cm_rej_msg *rej_msg; struct ib_cm_rej_event_param *param; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rej_rcvd; param->ari = rej_msg->ari; param->ari_length = cm_rej_get_reject_info_len(rej_msg); param->reason = __be16_to_cpu(rej_msg->reason); work->cm_event.private_data = &rej_msg->private_data; } static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; __be32 remote_id; remote_id = rej_msg->local_comm_id; if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) { spin_lock_irq(&cm.lock); timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari), remote_id); if (!timewait_info) { spin_unlock_irq(&cm.lock); return NULL; } cm_id_priv = idr_find(&cm.local_id_table, (__force int) (timewait_info->work.local_id ^ cm.random_id_operand)); if (cm_id_priv) { if (cm_id_priv->id.remote_id == remote_id) atomic_inc(&cm_id_priv->refcount); else cm_id_priv = NULL; } spin_unlock_irq(&cm.lock); } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ) cm_id_priv = 
cm_acquire_id(rej_msg->remote_comm_id, 0); else cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id); return cm_id_priv; } static int cm_rej_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rej_msg *rej_msg; int ret; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_rejected_id(rej_msg); if (!cm_id_priv) return -EINVAL; cm_format_rej_event(work); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); /* fall through */ case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN) cm_enter_timewait(cm_id_priv); else cm_reset_to_idle(cm_id_priv); break; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); /* fall through */ case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: cm_enter_timewait(cm_id_priv); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - cm_enter_timewait(cm_id_priv); - break; + cm_enter_timewait(cm_id_priv); + break; } /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto out; } ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } int ib_send_cm_mra(struct ib_cm_id *cm_id, u8 service_timeout, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; enum ib_cm_state cm_state; enum 
ib_cm_lap_state lap_state; enum cm_msg_response msg_response; void *data; unsigned long flags; int ret; if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) return -EINVAL; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch(cm_id_priv->id.state) { case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; msg_response = CM_MSG_RESPONSE_REQ; break; case IB_CM_REP_RCVD: cm_state = IB_CM_MRA_REP_SENT; lap_state = cm_id->lap_state; msg_response = CM_MSG_RESPONSE_REP; break; case IB_CM_ESTABLISHED: if (cm_id->lap_state == IB_CM_LAP_RCVD) { cm_state = cm_id->state; lap_state = IB_CM_MRA_LAP_SENT; msg_response = CM_MSG_RESPONSE_OTHER; break; } default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto error1; } if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto error1; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, msg_response, service_timeout, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) goto error2; } cm_id->state = cm_state; cm_id->lap_state = lap_state; cm_id_priv->service_timeout = service_timeout; cm_set_private_data(cm_id_priv, data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); return ret; error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); cm_free_msg(msg); return ret; } EXPORT_SYMBOL(ib_send_cm_mra); static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { switch (cm_mra_get_msg_mraed(mra_msg)) { case CM_MSG_RESPONSE_REQ: return cm_acquire_id(mra_msg->remote_comm_id, 0); case CM_MSG_RESPONSE_REP: case CM_MSG_RESPONSE_OTHER: return cm_acquire_id(mra_msg->remote_comm_id, mra_msg->local_comm_id); 
default: return NULL; } } static int cm_mra_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_mra_msg *mra_msg; int timeout, ret; mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_mraed_id(mra_msg); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = &mra_msg->private_data; work->cm_event.param.mra_rcvd.service_timeout = cm_mra_get_service_timeout(mra_msg); timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) + cm_convert_to_ms(cm_id_priv->av.timeout); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ || ib_modify_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; break; case IB_CM_REP_SENT: if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP || ib_modify_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; break; case IB_CM_ESTABLISHED: if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER || cm_id_priv->id.lap_state != IB_CM_LAP_SENT || ib_modify_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg, timeout)) { if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) atomic_long_inc(&work->port-> counter_group[CM_RECV_DUPLICATES]. counter[CM_MRA_COUNTER]); goto out; } cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; break; case IB_CM_MRA_REQ_RCVD: case IB_CM_MRA_REP_RCVD: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_MRA_COUNTER]); /* fall through */ default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); goto out; } cm_id_priv->msg->context[1] = (void *) (unsigned long) cm_id_priv->id.state; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: spin_unlock_irq(&cm_id_priv->lock); cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_lap(struct cm_lap_msg *lap_msg, struct cm_id_private *cm_id_priv, struct ib_sa_path_rec *alternate_path, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; lap_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn); /* todo: need remote CM response timeout */ cm_lap_set_remote_resp_timeout(lap_msg, 0x1F); lap_msg->alt_local_lid = alternate_path->slid; lap_msg->alt_remote_lid = alternate_path->dlid; lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; cm_lap_set_packet_rate(lap_msg, alternate_path->rate); cm_lap_set_sl(lap_msg, alternate_path->sl); cm_lap_set_subnet_local(lap_msg, 1); /* local only... 
*/ cm_lap_set_local_ack_timeout(lap_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alternate_path->packet_life_time)); if (private_data && private_data_len) memcpy(lap_msg->private_data, private_data, private_data_len); } int ib_send_cm_lap(struct ib_cm_id *cm_id, struct ib_sa_path_rec *alternate_path, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED || (cm_id->lap_state != IB_CM_LAP_UNINIT && cm_id->lap_state != IB_CM_LAP_IDLE)) { ret = -EINVAL; goto out; } - ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av); + ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, + cm_id_priv); if (ret) goto out; cm_id_priv->alt_av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->alt_av.timeout - 1); ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv, alternate_path, private_data, private_data_len); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED; ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->lap_state = IB_CM_LAP_SENT; cm_id_priv->msg = msg; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_lap); static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct ib_sa_path_rec *path, struct cm_lap_msg *lap_msg) { memset(path, 0, sizeof *path); path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; path->dlid = lap_msg->alt_local_lid; path->slid = lap_msg->alt_remote_lid; path->flow_label = cm_lap_get_flow_label(lap_msg); path->hop_limit = 
lap_msg->alt_hop_limit; path->traffic_class = cm_lap_get_traffic_class(lap_msg); path->reversible = 1; path->pkey = cm_id_priv->pkey; path->sl = cm_lap_get_sl(lap_msg); path->mtu_selector = IB_SA_EQ; path->mtu = cm_id_priv->path_mtu; path->rate_selector = IB_SA_EQ; path->rate = cm_lap_get_packet_rate(lap_msg); path->packet_life_time_selector = IB_SA_EQ; path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); path->packet_life_time -= (path->packet_life_time > 0); } static int cm_lap_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_lap_msg *lap_msg; struct ib_cm_lap_event_param *param; struct ib_mad_send_buf *msg = NULL; int ret; /* todo: verify LAP request and send reject APR if invalid. */ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id, lap_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; param = &work->cm_event.param.lap_rcvd; param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); work->cm_event.private_data = &lap_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) goto unlock; switch (cm_id_priv->id.lap_state) { case IB_CM_LAP_UNINIT: case IB_CM_LAP_IDLE: break; case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_LAP_COUNTER]); if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); if (ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_LAP_COUNTER]); goto unlock; default: goto unlock; } cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); - if (cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av)) - goto unlock; + cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, + cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); deref: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_apr(struct cm_apr_msg *apr_msg, struct cm_id_private *cm_id_priv, enum ib_cm_apr_status status, void *info, u8 info_length, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid); apr_msg->local_comm_id = cm_id_priv->id.local_id; apr_msg->remote_comm_id = cm_id_priv->id.remote_id; apr_msg->ap_status = (u8) status; if (info && info_length) { apr_msg->info_length = info_length; memcpy(apr_msg->info, info, info_length); } if (private_data && private_data_len) memcpy(apr_msg->private_data, private_data, private_data_len); } int ib_send_cm_apr(struct ib_cm_id *cm_id, enum ib_cm_apr_status status, void *info, u8 info_length, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) || (info && info_length > IB_CM_APR_INFO_LENGTH)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED || (cm_id->lap_state != IB_CM_LAP_RCVD && cm_id->lap_state != IB_CM_MRA_LAP_SENT)) { ret = -EINVAL; goto out; } ret = 
cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status, info, info_length, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->lap_state = IB_CM_LAP_IDLE; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_apr); static int cm_apr_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_apr_msg *apr_msg; int ret; apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id, apr_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. */ work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status; work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info; work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length; work->cm_event.private_data = &apr_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED || (cm_id_priv->id.lap_state != IB_CM_LAP_SENT && cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); cm_id_priv->msg = NULL; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_timewait_handler(struct cm_work *work) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; int ret; timewait_info = (struct cm_timewait_info *)work; spin_lock_irq(&cm.lock); list_del(&timewait_info->list); spin_unlock_irq(&cm.lock); cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); if 
(!cm_id_priv) return -EINVAL; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_TIMEWAIT || cm_id_priv->remote_qpn != timewait_info->remote_qpn) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.state = IB_CM_IDLE; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_req_param *param) { cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR)); sidr_req_msg->request_id = cm_id_priv->id.local_id; sidr_req_msg->pkey = param->path->pkey; sidr_req_msg->service_id = param->service_id; if (param->private_data && param->private_data_len) memcpy(sidr_req_msg->private_data, param->private_data, param->private_data_len); } int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, struct ib_cm_sidr_req_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if (!param->path || (param->private_data && param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - - spin_lock_irqsave(&cm_id_priv->lock, flags); - - ret = cm_init_av_by_path(param->path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); if (ret) goto out; cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = param->timeout_ms; cm_id_priv->max_cm_retries = param->max_cm_retries; ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv, param); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned 
long) IB_CM_SIDR_REQ_SENT; + spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_IDLE) ret = ib_post_send_mad(msg, NULL); else ret = -EINVAL; if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); goto out; } cm_id->state = IB_CM_SIDR_REQ_SENT; cm_id_priv->msg = msg; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +out: return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); static void cm_format_sidr_req_event(struct cm_work *work, struct ib_cm_id *listen_id) { struct cm_sidr_req_msg *sidr_req_msg; struct ib_cm_sidr_req_event_param *param; sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_req_rcvd; param->pkey = __be16_to_cpu(sidr_req_msg->pkey); param->listen_id = listen_id; + param->service_id = sidr_req_msg->service_id; + param->bth_pkey = cm_get_bth_pkey(work); param->port = work->port->port_num; work->cm_event.private_data = &sidr_req_msg->private_data; } static int cm_sidr_req_handler(struct cm_work *work) { struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *cur_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); cm_id_priv = container_of(cm_id, struct cm_id_private, id); /* Record SGID/SLID and request ID for lookup. 
*/ sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; wc = work->mad_recv_wc->wc; cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); cm_id_priv->av.dgid.global.interface_id = 0; cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); cm_id_priv->id.remote_id = sidr_req_msg->request_id; cm_id_priv->tid = sidr_req_msg->hdr.tid; atomic_inc(&cm_id_priv->work_count); spin_lock_irq(&cm.lock); cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); if (cur_cm_id_priv) { spin_unlock_irq(&cm.lock); atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_SIDR_REQ_COUNTER]); goto out; /* Duplicate message. */ } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; cur_cm_id_priv = cm_find_listen(cm_id->device, - sidr_req_msg->service_id, - sidr_req_msg->private_data); + sidr_req_msg->service_id); if (!cur_cm_id_priv) { spin_unlock_irq(&cm.lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); goto out; /* No match. 
*/ } atomic_inc(&cur_cm_id_priv->refcount); atomic_inc(&cm_id_priv->refcount); spin_unlock_irq(&cm.lock); cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler; cm_id_priv->id.context = cur_cm_id_priv->id.context; cm_id_priv->id.service_id = sidr_req_msg->service_id; cm_id_priv->id.service_mask = ~cpu_to_be64(0); cm_format_sidr_req_event(work, &cur_cm_id_priv->id); cm_process_work(cm_id_priv, work); cm_deref_id(cur_cm_id_priv); return 0; out: ib_destroy_cm_id(&cm_id_priv->id); return -EINVAL; } static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param) { cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, cm_id_priv->tid); sidr_rep_msg->request_id = cm_id_priv->id.remote_id; sidr_rep_msg->status = param->status; cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num)); sidr_rep_msg->service_id = cm_id_priv->id.service_id; sidr_rep_msg->qkey = cpu_to_be32(param->qkey); if (param->info && param->info_length) memcpy(sidr_rep_msg->info, param->info, param->info_length); if (param->private_data && param->private_data_len) memcpy(sidr_rep_msg->private_data, param->private_data, param->private_data_len); } int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || (param->private_data && param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_SIDR_REQ_RCVD) { ret = -EINVAL; goto error; } ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto error; cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, param); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return 
ret; } cm_id->state = IB_CM_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); spin_lock_irqsave(&cm.lock, flags); - rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); + } spin_unlock_irqrestore(&cm.lock, flags); return 0; error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_rep); static void cm_format_sidr_rep_event(struct cm_work *work) { struct cm_sidr_rep_msg *sidr_rep_msg; struct ib_cm_sidr_rep_event_param *param; sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_rep_rcvd; param->status = sidr_rep_msg->status; param->qkey = be32_to_cpu(sidr_rep_msg->qkey); param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg)); param->info = &sidr_rep_msg->info; param->info_len = sidr_rep_msg->info_length; work->cm_event.private_data = &sidr_rep_msg->private_data; } static int cm_sidr_rep_handler(struct cm_work *work) { struct cm_sidr_rep_msg *sidr_rep_msg; struct cm_id_private *cm_id_priv; sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. 
*/ spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.state = IB_CM_IDLE; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); cm_format_sidr_rep_event(work); cm_process_work(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_process_send_error(struct ib_mad_send_buf *msg, enum ib_wc_status wc_status) { struct cm_id_private *cm_id_priv; struct ib_cm_event cm_event; enum ib_cm_state state; int ret; memset(&cm_event, 0, sizeof cm_event); cm_id_priv = msg->context[0]; /* Discard old sends or ones without a response. */ spin_lock_irq(&cm_id_priv->lock); state = (enum ib_cm_state) (unsigned long) msg->context[1]; if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) goto discard; switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: cm_reset_to_idle(cm_id_priv); cm_event.event = IB_CM_REQ_ERROR; break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_reset_to_idle(cm_id_priv); cm_event.event = IB_CM_REP_ERROR; break; case IB_CM_DREQ_SENT: cm_enter_timewait(cm_id_priv); cm_event.event = IB_CM_DREQ_ERROR; break; case IB_CM_SIDR_REQ_SENT: cm_id_priv->id.state = IB_CM_IDLE; cm_event.event = IB_CM_SIDR_REQ_ERROR; break; default: goto discard; } spin_unlock_irq(&cm_id_priv->lock); cm_event.param.send_status = wc_status; /* No other events can occur on the cm_id at this point. 
*/ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); cm_free_msg(msg); if (ret) ib_destroy_cm_id(&cm_id_priv->id); return; discard: spin_unlock_irq(&cm_id_priv->lock); cm_free_msg(msg); } static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; struct cm_port *port; u16 attr_index; port = mad_agent->context; attr_index = be16_to_cpu(((struct ib_mad_hdr *) msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; /* * If the send was in response to a received message (context[0] is not * set to a cm_id), and is not a REJ, then it is a send that was * manually retried. */ if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) msg->retries = 1; atomic_long_add(1 + msg->retries, &port->counter_group[CM_XMIT].counter[attr_index]); if (msg->retries) atomic_long_add(msg->retries, &port->counter_group[CM_XMIT_RETRIES]. counter[attr_index]); switch (mad_send_wc->status) { case IB_WC_SUCCESS: case IB_WC_WR_FLUSH_ERR: cm_free_msg(msg); break; default: if (msg->context[0] && msg->context[1]) cm_process_send_error(msg, mad_send_wc->status); else cm_free_msg(msg); break; } } static void cm_work_handler(struct work_struct *_work) { struct cm_work *work = container_of(_work, struct cm_work, work.work); int ret; switch (work->cm_event.event) { case IB_CM_REQ_RECEIVED: ret = cm_req_handler(work); break; case IB_CM_MRA_RECEIVED: ret = cm_mra_handler(work); break; case IB_CM_REJ_RECEIVED: ret = cm_rej_handler(work); break; case IB_CM_REP_RECEIVED: ret = cm_rep_handler(work); break; case IB_CM_RTU_RECEIVED: ret = cm_rtu_handler(work); break; case IB_CM_USER_ESTABLISHED: ret = cm_establish_handler(work); break; case IB_CM_DREQ_RECEIVED: ret = cm_dreq_handler(work); break; case IB_CM_DREP_RECEIVED: ret = cm_drep_handler(work); break; case IB_CM_SIDR_REQ_RECEIVED: ret = cm_sidr_req_handler(work); break; case IB_CM_SIDR_REP_RECEIVED: ret = cm_sidr_rep_handler(work); break; case IB_CM_LAP_RECEIVED: ret 
= cm_lap_handler(work); break; case IB_CM_APR_RECEIVED: ret = cm_apr_handler(work); break; case IB_CM_TIMEWAIT_EXIT: ret = cm_timewait_handler(work); break; default: - pr_debug("work->cm_event.event: 0x%x\n", work->cm_event.event); ret = -EINVAL; break; } if (ret) cm_free_work(work); } static int cm_establish(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; struct cm_work *work; unsigned long flags; int ret = 0; + struct cm_device *cm_dev; + cm_dev = ib_get_client_data(cm_id->device, &cm_client); + if (!cm_dev) + return -ENODEV; + work = kmalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id->state) { case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_id->state = IB_CM_ESTABLISHED; break; case IB_CM_ESTABLISHED: ret = -EISCONN; break; default: - pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (ret) { kfree(work); goto out; } /* * The CM worker thread may try to destroy the cm_id before it * can execute this work item. To prevent potential deadlock, * we need to find the cm_id once we're in the context of the * worker thread, rather than holding a reference on it. 
*/ INIT_DELAYED_WORK(&work->work, cm_work_handler); work->local_id = cm_id->local_id; work->remote_id = cm_id->remote_id; work->mad_recv_wc = NULL; work->cm_event.event = IB_CM_USER_ESTABLISHED; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) { + queue_delayed_work(cm.wq, &work->work, 0); + } else { + kfree(work); + ret = -ENODEV; + } + spin_unlock_irqrestore(&cm.lock, flags); + out: return ret; } static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; + struct cm_av tmp_av; unsigned long flags; + int tmp_send_port_not_ready; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_ESTABLISHED && (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; + /* Swap address vector */ + tmp_av = cm_id_priv->av; cm_id_priv->av = cm_id_priv->alt_av; + cm_id_priv->alt_av = tmp_av; + /* Swap port send ready state */ + tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; + cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; + cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) { int ret; switch (event) { case IB_EVENT_COMM_EST: ret = cm_establish(cm_id); break; case IB_EVENT_PATH_MIG: ret = cm_migrate(cm_id); break; default: ret = -EINVAL; } return ret; } EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; struct cm_work *work; enum ib_cm_event_type event; u16 attr_id; int paths = 0; + int going_down = 0; switch 
(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { case CM_REQ_ATTR_ID: paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)-> alt_local_lid != 0); event = IB_CM_REQ_RECEIVED; break; case CM_MRA_ATTR_ID: event = IB_CM_MRA_RECEIVED; break; case CM_REJ_ATTR_ID: event = IB_CM_REJ_RECEIVED; break; case CM_REP_ATTR_ID: event = IB_CM_REP_RECEIVED; break; case CM_RTU_ATTR_ID: event = IB_CM_RTU_RECEIVED; break; case CM_DREQ_ATTR_ID: event = IB_CM_DREQ_RECEIVED; break; case CM_DREP_ATTR_ID: event = IB_CM_DREP_RECEIVED; break; case CM_SIDR_REQ_ATTR_ID: event = IB_CM_SIDR_REQ_RECEIVED; break; case CM_SIDR_REP_ATTR_ID: event = IB_CM_SIDR_REP_RECEIVED; break; case CM_LAP_ATTR_ID: paths = 1; event = IB_CM_LAP_RECEIVED; break; case CM_APR_ATTR_ID: event = IB_CM_APR_RECEIVED; break; default: ib_free_recv_mad(mad_recv_wc); return; } attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); atomic_long_inc(&port->counter_group[CM_RECV]. counter[attr_id - CM_ATTR_ID_OFFSET]); work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths, GFP_KERNEL); if (!work) { ib_free_recv_mad(mad_recv_wc); return; } INIT_DELAYED_WORK(&work->work, cm_work_handler); work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; work->port = port; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!port->cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irq(&cm.lock); + + if (going_down) { + kfree(work); + ib_free_recv_mad(mad_recv_wc); + } } static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case 
IB_CM_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; if (cm_id_priv->responder_resources) qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC; qp_attr->pkey_index = cm_id_priv->av.pkey_index; qp_attr->port_num = cm_id_priv->av.port->port_num; ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; - if (!cm_id_priv->av.valid) - return -EINVAL; - if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { - qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_VID; - } - if (!is_zero_ether_addr(cm_id_priv->av.smac)) { - memcpy(qp_attr->smac, cm_id_priv->av.smac, - sizeof(qp_attr->smac)); - *qp_attr_mask |= IB_QP_SMAC; - } - if (cm_id_priv->alt_av.valid) { - if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { - qp_attr->alt_vlan_id = - cm_id_priv->alt_av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_ALT_VID; - } - if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { - memcpy(qp_attr->alt_smac, - cm_id_priv->alt_av.smac, - sizeof(qp_attr->alt_smac)); - *qp_attr_mask |= IB_QP_ALT_SMAC; - } - } - qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); if (cm_id_priv->qp_type == IB_QPT_RC || cm_id_priv->qp_type == 
IB_QPT_XRC_TGT) { *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; qp_attr->max_dest_rd_atomic = cm_id_priv->responder_resources; qp_attr->min_rnr_timer = 0; } if (cm_id_priv->alt_av.ah_attr.dlid) { *qp_attr_mask |= IB_QP_ALT_PATH; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; } ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { /* Allow transition to RTS before sending REP */ case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); switch (cm_id_priv->qp_type) { case IB_QPT_RC: case IB_QPT_XRC_INI: *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC; qp_attr->retry_cnt = cm_id_priv->retry_count; qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; /* fall through */ case IB_QPT_XRC_TGT: *qp_attr_mask |= IB_QP_TIMEOUT; qp_attr->timeout = cm_id_priv->av.timeout; break; default: break; } if (cm_id_priv->alt_av.ah_attr.dlid) { *qp_attr_mask |= IB_QP_PATH_MIG_STATE; qp_attr->path_mig_state = IB_MIG_REARM; } } else { *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = 
cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; qp_attr->path_mig_state = IB_MIG_REARM; } ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct cm_id_private *cm_id_priv; int ret; cm_id_priv = container_of(cm_id, struct cm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTR: ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: - pr_debug("qp_attr->qp_state: 0x%x\n", qp_attr->qp_state); ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static void cm_get_ack_delay(struct cm_device *cm_dev) -{ - struct ib_device_attr attr; - - if (ib_query_device(cm_dev->ib_device, &attr)) - cm_dev->ack_delay = 0; /* acks will rely on packet life time */ - else - cm_dev->ack_delay = attr.local_ca_ack_delay; -} - static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, char *buf) { struct cm_counter_group *group; struct cm_counter_attribute *cm_attr; group = container_of(obj, struct cm_counter_group, obj); cm_attr = container_of(attr, struct cm_counter_attribute, attr); return sprintf(buf, "%ld\n", atomic_long_read(&group->counter[cm_attr->index])); } static const struct sysfs_ops cm_counter_ops = { .show = cm_show_counter }; static struct kobj_type cm_counter_obj_type = { .sysfs_ops = &cm_counter_ops, .default_attrs = cm_counter_default_attrs }; static void cm_release_port_obj(struct kobject *obj) { struct cm_port *cm_port; cm_port = container_of(obj, struct cm_port, port_obj); kfree(cm_port); } static struct kobj_type cm_port_obj_type = { .release = cm_release_port_obj }; static 
char *cm_devnode(struct device *dev, umode_t *mode) { if (mode) *mode = 0666; return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } struct class cm_class = { .owner = THIS_MODULE, .name = "infiniband_cm", .devnode = cm_devnode, }; EXPORT_SYMBOL(cm_class); static int cm_create_port_fs(struct cm_port *port) { int i, ret; ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, &port->cm_dev->device->kobj, "%d", port->port_num); if (ret) { kfree(port); return ret; } for (i = 0; i < CM_COUNTER_GROUPS; i++) { ret = kobject_init_and_add(&port->counter_group[i].obj, &cm_counter_obj_type, &port->port_obj, "%s", counter_group_names[i]); if (ret) goto error; } return 0; error: while (i--) kobject_put(&port->counter_group[i].obj); kobject_put(&port->port_obj); return ret; } static void cm_remove_port_fs(struct cm_port *port) { int i; for (i = 0; i < CM_COUNTER_GROUPS; i++) kobject_put(&port->counter_group[i].obj); kobject_put(&port->port_obj); } static void cm_add_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; struct ib_mad_reg_req reg_req = { .mgmt_class = IB_MGMT_CLASS_CM, - .mgmt_class_version = IB_CM_CLASS_VERSION + .mgmt_class_version = IB_CM_CLASS_VERSION, }; struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; int ret; + int count = 0; u8 i; - if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB) - return; - cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * ib_device->phys_port_cnt, GFP_KERNEL); if (!cm_dev) return; cm_dev->ib_device = ib_device; - cm_get_ack_delay(cm_dev); - + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; + cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, "%s", ib_device->name); if (IS_ERR(cm_dev->device)) { kfree(cm_dev); return; } set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if (!rdma_cap_ib_cm(ib_device, i)) + 
continue; + port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto error1; cm_dev->port[i-1] = port; port->cm_dev = cm_dev; port->port_num = i; + INIT_LIST_HEAD(&port->cm_priv_prim_list); + INIT_LIST_HEAD(&port->cm_priv_altr_list); + ret = cm_create_port_fs(port); if (ret) goto error1; port->mad_agent = ib_register_mad_agent(ib_device, i, IB_QPT_GSI, ®_req, 0, cm_send_handler, cm_recv_handler, - port); + port, + 0); if (IS_ERR(port->mad_agent)) goto error2; ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) goto error3; + + count++; } + + if (!count) + goto free; + ib_set_client_data(ib_device, &cm_client, cm_dev); write_lock_irqsave(&cm.device_lock, flags); list_add_tail(&cm_dev->list, &cm.device_list); write_unlock_irqrestore(&cm.device_lock, flags); return; error3: ib_unregister_mad_agent(port->mad_agent); error2: cm_remove_port_fs(port); error1: port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); cm_remove_port_fs(port); } +free: device_unregister(cm_dev->device); kfree(cm_dev); } -static void cm_remove_one(struct ib_device *ib_device) +static void cm_remove_one(struct ib_device *ib_device, void *client_data) { - struct cm_device *cm_dev; + struct cm_device *cm_dev = client_data; struct cm_port *port; + struct cm_id_private *cm_id_priv; + struct ib_mad_agent *cur_mad_agent; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; int i; - cm_dev = ib_get_client_data(ib_device, &cm_client); if (!cm_dev) return; write_lock_irqsave(&cm.device_lock, flags); list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); + spin_lock_irq(&cm.lock); + cm_dev->going_down = 1; + spin_unlock_irq(&cm.lock); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if 
(!rdma_cap_ib_cm(ib_device, i)) + continue; + port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); - ib_unregister_mad_agent(port->mad_agent); + /* Mark all the cm_id's as not valid */ + spin_lock_irq(&cm.lock); + list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) + cm_id_priv->altr_send_port_not_ready = 1; + list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) + cm_id_priv->prim_send_port_not_ready = 1; + spin_unlock_irq(&cm.lock); + /* + * We flush the queue here after the going_down set, this + * verify that no new works will be queued in the recv handler, + * after that we can call the unregister_mad_agent + */ flush_workqueue(cm.wq); + spin_lock_irq(&cm.state_lock); + cur_mad_agent = port->mad_agent; + port->mad_agent = NULL; + spin_unlock_irq(&cm.state_lock); + ib_unregister_mad_agent(cur_mad_agent); cm_remove_port_fs(port); } + device_unregister(cm_dev->device); kfree(cm_dev); } static int __init ib_cm_init(void) { int ret; memset(&cm, 0, sizeof cm); INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); + spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; cm.remote_qp_table = RB_ROOT; cm.remote_sidr_table = RB_ROOT; idr_init(&cm.local_id_table); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); - if (!idr_pre_get(&cm.local_id_table, GFP_KERNEL)) - return -ENOMEM; INIT_LIST_HEAD(&cm.timewait_list); ret = class_register(&cm_class); if (ret) { ret = -ENOMEM; goto error1; } cm.wq = create_workqueue("ib_cm"); if (!cm.wq) { ret = -ENOMEM; goto error2; } ret = ib_register_client(&cm_client); if (ret) goto error3; return 0; error3: destroy_workqueue(cm.wq); error2: class_unregister(&cm_class); error1: idr_destroy(&cm.local_id_table); return ret; } static void __exit ib_cm_cleanup(void) { struct cm_timewait_info *timewait_info, *tmp; 
spin_lock_irq(&cm.lock); list_for_each_entry(timewait_info, &cm.timewait_list, list) cancel_delayed_work(&timewait_info->work.work); spin_unlock_irq(&cm.lock); ib_unregister_client(&cm_client); destroy_workqueue(cm.wq); list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) { list_del(&timewait_info->list); kfree(timewait_info); } class_unregister(&cm_class); idr_destroy(&cm.local_id_table); } module_init_order(ib_cm_init, SI_ORDER_SECOND); module_exit_order(ib_cm_cleanup, SI_ORDER_FIRST); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cm_msgs.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cm_msgs.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cm_msgs.h (revision 319974) @@ -1,836 +1,836 @@ /* * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if !defined(CM_MSGS_H) #define CM_MSGS_H #include #include /* * Parameters to routines below should be in network-byte order, and values * are returned in network-byte order. */ #define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ enum cm_msg_sequence { CM_MSG_SEQUENCE_REQ, CM_MSG_SEQUENCE_LAP, CM_MSG_SEQUENCE_DREQ, CM_MSG_SEQUENCE_SIDR }; struct cm_req_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 rsvd4; __be64 service_id; __be64 local_ca_guid; __be32 rsvd24; __be32 local_qkey; /* local QPN:24, responder resources:8 */ __be32 offset32; /* local EECN:24, initiator depth:8 */ __be32 offset36; /* * remote EECN:24, remote CM response timeout:5, * transport service type:2, end-to-end flow control:1 */ __be32 offset40; /* starting PSN:24, local CM response timeout:5, retry count:3 */ __be32 offset44; __be16 pkey; /* path MTU:4, RDC exists:1, RNR retry count:3.
*/ u8 offset50; /* max CM Retries:4, SRQ:1, extended transport type:3 */ u8 offset51; __be16 primary_local_lid; __be16 primary_remote_lid; union ib_gid primary_local_gid; union ib_gid primary_remote_gid; /* flow label:20, rsvd:6, packet rate:6 */ __be32 primary_offset88; u8 primary_traffic_class; u8 primary_hop_limit; /* SL:4, subnet local:1, rsvd:3 */ u8 primary_offset94; /* local ACK timeout:5, rsvd:3 */ u8 primary_offset95; __be16 alt_local_lid; __be16 alt_remote_lid; union ib_gid alt_local_gid; union ib_gid alt_remote_gid; /* flow label:20, rsvd:6, packet rate:6 */ __be32 alt_offset132; u8 alt_traffic_class; u8 alt_hop_limit; /* SL:4, subnet local:1, rsvd:3 */ u8 alt_offset138; /* local ACK timeout:5, rsvd:3 */ u8 alt_offset139; - u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE]; + u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; } __attribute__ ((packed)); static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg) { return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8); } static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn) { req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) | (be32_to_cpu(req_msg->offset32) & 0x000000FF)); } static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg) { return (u8) be32_to_cpu(req_msg->offset32); } static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res) { req_msg->offset32 = cpu_to_be32(resp_res | (be32_to_cpu(req_msg->offset32) & 0xFFFFFF00)); } static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg) { return (u8) be32_to_cpu(req_msg->offset36); } static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg, u8 init_depth) { req_msg->offset36 = cpu_to_be32(init_depth | (be32_to_cpu(req_msg->offset36) & 0xFFFFFF00)); } static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg) { return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3); } static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg 
*req_msg, u8 resp_timeout) { req_msg->offset40 = cpu_to_be32((resp_timeout << 3) | (be32_to_cpu(req_msg->offset40) & 0xFFFFFF07)); } static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) { u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1; switch(transport_type) { case 0: return IB_QPT_RC; case 1: return IB_QPT_UC; case 3: switch (req_msg->offset51 & 0x7) { case 1: return IB_QPT_XRC_TGT; default: return 0; } default: return 0; } } static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, enum ib_qp_type qp_type) { switch(qp_type) { case IB_QPT_UC: req_msg->offset40 = cpu_to_be32((be32_to_cpu( req_msg->offset40) & 0xFFFFFFF9) | 0x2); break; case IB_QPT_XRC_INI: req_msg->offset40 = cpu_to_be32((be32_to_cpu( req_msg->offset40) & 0xFFFFFFF9) | 0x6); req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1; break; default: req_msg->offset40 = cpu_to_be32(be32_to_cpu( req_msg->offset40) & 0xFFFFFFF9); } } static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg) { return be32_to_cpu(req_msg->offset40) & 0x1; } static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg, u8 flow_ctrl) { req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) | (be32_to_cpu(req_msg->offset40) & 0xFFFFFFFE)); } static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg) { return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8); } static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg, __be32 starting_psn) { req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | (be32_to_cpu(req_msg->offset44) & 0x000000FF)); } static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg) { return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3); } static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg, u8 resp_timeout) { req_msg->offset44 = cpu_to_be32((resp_timeout << 3) | (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07)); } static inline u8 cm_req_get_retry_count(struct 
cm_req_msg *req_msg) { return (u8) (be32_to_cpu(req_msg->offset44) & 0x7); } static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg, u8 retry_count) { req_msg->offset44 = cpu_to_be32((retry_count & 0x7) | (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8)); } static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg) { return req_msg->offset50 >> 4; } static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu) { req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4)); } static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg) { return req_msg->offset50 & 0x7; } static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg, u8 rnr_retry_count) { req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) | (rnr_retry_count & 0x7)); } static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg) { return req_msg->offset51 >> 4; } static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg, u8 retries) { req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4)); } static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg) { return (req_msg->offset51 & 0x8) >> 3; } static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq) { req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) | ((srq & 0x1) << 3)); } static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg) { return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12); } static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg, __be32 flow_label) { req_msg->primary_offset88 = cpu_to_be32( (be32_to_cpu(req_msg->primary_offset88) & 0x00000FFF) | (be32_to_cpu(flow_label) << 12)); } static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg) { return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F); } static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg, u8 rate) { req_msg->primary_offset88 = cpu_to_be32( 
(be32_to_cpu(req_msg->primary_offset88) & 0xFFFFFFC0) | (rate & 0x3F)); } static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg) { return (u8) (req_msg->primary_offset94 >> 4); } static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl) { req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) | (sl << 4)); } static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg) { return (u8) ((req_msg->primary_offset94 & 0x08) >> 3); } static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg, u8 subnet_local) { req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) | ((subnet_local & 0x1) << 3)); } static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg) { return (u8) (req_msg->primary_offset95 >> 3); } static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg, u8 local_ack_timeout) { req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) | (local_ack_timeout << 3)); } static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg) { return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12); } static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg, __be32 flow_label) { req_msg->alt_offset132 = cpu_to_be32( (be32_to_cpu(req_msg->alt_offset132) & 0x00000FFF) | (be32_to_cpu(flow_label) << 12)); } static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg) { return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F); } static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg, u8 rate) { req_msg->alt_offset132 = cpu_to_be32( (be32_to_cpu(req_msg->alt_offset132) & 0xFFFFFFC0) | (rate & 0x3F)); } static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg) { return (u8) (req_msg->alt_offset138 >> 4); } static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl) { req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) | (sl << 4)); } static inline 
u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg) { return (u8) ((req_msg->alt_offset138 & 0x08) >> 3); } static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg, u8 subnet_local) { req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) | ((subnet_local & 0x1) << 3)); } static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg) { return (u8) (req_msg->alt_offset139 >> 3); } static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg, u8 local_ack_timeout) { req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) | (local_ack_timeout << 3)); } /* Message REJected or MRAed */ enum cm_msg_response { CM_MSG_RESPONSE_REQ = 0x0, CM_MSG_RESPONSE_REP = 0x1, CM_MSG_RESPONSE_OTHER = 0x2 }; struct cm_mra_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; /* message MRAed:2, rsvd:6 */ u8 offset8; /* service timeout:5, rsvd:3 */ u8 offset9; u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg) { return (u8) (mra_msg->offset8 >> 6); } static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg) { mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6)); } static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg) { return (u8) (mra_msg->offset9 >> 3); } static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg, u8 service_timeout) { mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) | (service_timeout << 3)); } struct cm_rej_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; /* message REJected:2, rsvd:6 */ u8 offset8; /* reject info length:7, rsvd:1. 
*/ u8 offset9; __be16 reason; u8 ari[IB_CM_REJ_ARI_LENGTH]; u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg) { return (u8) (rej_msg->offset8 >> 6); } static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg) { rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6)); } static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg) { return (u8) (rej_msg->offset9 >> 1); } static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg, u8 len) { rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1)); } struct cm_rep_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; __be32 local_qkey; /* local QPN:24, rsvd:8 */ __be32 offset12; /* local EECN:24, rsvd:8 */ __be32 offset16; /* starting PSN:24 rsvd:8 */ __be32 offset20; u8 resp_resources; u8 initiator_depth; /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */ u8 offset26; /* RNR retry count:3, SRQ:1, rsvd:5 */ u8 offset27; __be64 local_ca_guid; u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg) { return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8); } static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn) { rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | (be32_to_cpu(rep_msg->offset12) & 0x000000FF)); } static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg) { return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8); } static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn) { rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) | (be32_to_cpu(rep_msg->offset16) & 0x000000FF)); } static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type) { return (qp_type == IB_QPT_XRC_INI) ? 
cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg); } static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg) { return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8); } static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg, __be32 starting_psn) { rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | (be32_to_cpu(rep_msg->offset20) & 0x000000FF)); } static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg) { return (u8) (rep_msg->offset26 >> 3); } static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg, u8 target_ack_delay) { rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) | (target_ack_delay << 3)); } static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg) { return (u8) ((rep_msg->offset26 & 0x06) >> 1); } static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover) { rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) | ((failover & 0x3) << 1)); } static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg) { return (u8) (rep_msg->offset26 & 0x01); } static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg, u8 flow_ctrl) { rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) | (flow_ctrl & 0x1)); } static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg) { return (u8) (rep_msg->offset27 >> 5); } static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg, u8 rnr_retry_count) { rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) | (rnr_retry_count << 5)); } static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg) { return (u8) ((rep_msg->offset27 >> 4) & 0x1); } static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq) { rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) | ((srq & 0x1) << 4)); } struct cm_rtu_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); struct cm_dreq_msg { 
struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; /* remote QPN/EECN:24, rsvd:8 */ __be32 offset8; u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); /* Remote QPN/EECN: upper 24 bits of offset8, exchanged as a big-endian 32-bit value. */ static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg) { return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8); } static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn) { dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | (be32_to_cpu(dreq_msg->offset8) & 0x000000FF)); } struct cm_drep_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); /* LAP message layout; the offsetNN members pack several bit-fields, read and written through the cm_lap_* accessors. */ struct cm_lap_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; __be32 rsvd8; /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */ __be32 offset12; __be32 rsvd16; __be16 alt_local_lid; __be16 alt_remote_lid; union ib_gid alt_local_gid; union ib_gid alt_remote_gid; /* flow label:20, rsvd:4, traffic class:8 */ __be32 offset56; u8 alt_hop_limit; /* rsvd:2, packet rate:6 */ u8 offset61; /* SL:4, subnet local:1, rsvd:3 */ u8 offset62; /* local ACK timeout:5, rsvd:3 */ u8 offset63; u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); /* Remote QPN/EECN: upper 24 bits of offset12, exchanged as a big-endian 32-bit value. */ static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg) { return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8); } static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn) { lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | (be32_to_cpu(lap_msg->offset12) & 0x000000FF)); } /* Remote CM response timeout: bits 7:3 of offset12 after conversion to host order. */ static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg) { return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3); } static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg, u8 resp_timeout) { lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) | (be32_to_cpu(lap_msg->offset12) & 0xFFFFFF07)); } /* Flow label: upper 20 bits of offset56. */ static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg
*lap_msg) { return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12); } static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg, __be32 flow_label) { lap_msg->offset56 = cpu_to_be32( (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) | (be32_to_cpu(flow_label) << 12)); } /* Traffic class: low 8 bits of offset56 (host order). */ static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg) { return (u8) be32_to_cpu(lap_msg->offset56); } static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg, u8 traffic_class) { lap_msg->offset56 = cpu_to_be32(traffic_class | (be32_to_cpu(lap_msg->offset56) & 0xFFFFFF00)); } /* Packet rate: low 6 bits of offset61. */ static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg) { return lap_msg->offset61 & 0x3F; } static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg, u8 packet_rate) { lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0); } /* SL: upper 4 bits of offset62. */ static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg) { return lap_msg->offset62 >> 4; } static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl) { lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F); } /* Subnet local: bit 3 of offset62. */ static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg) { return (lap_msg->offset62 >> 3) & 0x1; } /* Set bit 3 of offset62 from the low bit of subnet_local; the remaining bits must be preserved from offset62 itself (previously they were taken from offset61, clobbering the SL nibble and reserved bits). */ static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg, u8 subnet_local) { lap_msg->offset62 = ((subnet_local & 0x1) << 3) | (lap_msg->offset62 & 0xF7); } /* Local ACK timeout: upper 5 bits of offset63. */ static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg) { return lap_msg->offset63 >> 3; } static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg, u8 local_ack_timeout) { lap_msg->offset63 = (local_ack_timeout << 3) | (lap_msg->offset63 & 0x07); } struct cm_apr_msg { struct ib_mad_hdr hdr; __be32 local_comm_id; __be32 remote_comm_id; u8 info_length; u8 ap_status; __be16 rsvd; u8 info[IB_CM_APR_INFO_LENGTH]; u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); struct cm_sidr_req_msg { struct ib_mad_hdr hdr; __be32 request_id; __be16 pkey; __be16 rsvd; __be64 service_id; -
u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE]; + u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; } __attribute__ ((packed)); struct cm_sidr_rep_msg { struct ib_mad_hdr hdr; __be32 request_id; u8 status; u8 info_length; __be16 rsvd; /* QPN:24, rsvd:8 */ __be32 offset8; __be64 service_id; __be32 qkey; u8 info[IB_CM_SIDR_REP_INFO_LENGTH]; u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg) { return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8); } static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg, __be32 qpn) { sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | (be32_to_cpu(sidr_rep_msg->offset8) & 0x000000FF)); } #endif /* CM_MSGS_H */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cma.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cma.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/cma.c (revision 319974) @@ -1,3886 +1,4307 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #define LINUXKPI_PARAM_PREFIX ibcore_ #include #include #include #include #include #include #include #include #include -#include #include #include #include +#include +#include + #include #include +#include +#include #include #include #include #include +#include + +#include "core_priv.h" + MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 +#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 -static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT; -module_param_named(cma_response_timeout, cma_response_timeout, int, 0644); -MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT (default=20)"); +static const char * const cma_events[] = { + [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", + [RDMA_CM_EVENT_ADDR_ERROR] = "address error", + [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", + [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", + [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", + [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", + [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", + [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", + [RDMA_CM_EVENT_REJECTED] = "rejected", + [RDMA_CM_EVENT_ESTABLISHED] = "established", + [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", + [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", + [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", + [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", + [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", + [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", +}; -static int def_prec2sl = 3; -module_param_named(def_prec2sl, def_prec2sl, int, 0644); -MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. 
Valid values 0 - 7"); +const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) +{ + size_t index = event; -static int unify_tcp_port_space = 1; -module_param(unify_tcp_port_space, int, 0644); -MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " - "space allocation (default=1)"); + return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? + cma_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(rdma_event_msg); -static int debug_level = 0; -#define cma_pr(level, priv, format, arg...) \ - printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg) - -#define cma_dbg(priv, format, arg...) \ - do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0) - -#define cma_warn(priv, format, arg...) \ - cma_pr(KERN_WARNING, priv, format, ## arg) - -#define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x" -#define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\ - ((u8 *)(gid))[13],\ - ((u8 *)(gid))[14],\ - ((u8 *)(gid))[15] - -#define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw) -#define cma_debug_path(priv, pfx, p) \ - cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" \ - CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), \ - CMA_GID_ARG(p.dgid)) - -#define cma_debug_gid(priv, g) \ - cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g) - -module_param_named(debug_level, debug_level, int, 0644); -MODULE_PARM_DESC(debug_level, "debug level default=0"); - static void cma_add_one(struct ib_device *device); -static void cma_remove_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static struct workqueue_struct *cma_free_wq; -static DEFINE_IDR(sdp_ps); 
-static DEFINE_IDR(tcp_ps); -static DEFINE_IDR(udp_ps); -static DEFINE_IDR(ipoib_ps); -static DEFINE_IDR(ib_ps); +struct cma_pernet { + struct idr tcp_ps; + struct idr udp_ps; + struct idr ipoib_ps; + struct idr ib_ps; +}; + +VNET_DEFINE(struct cma_pernet, cma_pernet); + +static struct cma_pernet *cma_pernet_ptr(struct vnet *vnet) +{ + struct cma_pernet *retval; + + CURVNET_SET_QUIET(vnet); + retval = &VNET(cma_pernet); + CURVNET_RESTORE(); + + return (retval); +} + +static struct idr *cma_pernet_idr(struct vnet *net, enum rdma_port_space ps) +{ + struct cma_pernet *pernet = cma_pernet_ptr(net); + + switch (ps) { + case RDMA_PS_TCP: + return &pernet->tcp_ps; + case RDMA_PS_UDP: + return &pernet->udp_ps; + case RDMA_PS_IPOIB: + return &pernet->ipoib_ps; + case RDMA_PS_IB: + return &pernet->ib_ps; + default: + return NULL; + } +} + struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; atomic_t refcount; struct list_head id_list; + struct sysctl_ctx_list sysctl_ctx; + enum ib_gid_type *default_gid_type; }; struct rdma_bind_list { - struct idr *ps; + enum rdma_port_space ps; struct hlist_head owners; unsigned short port; }; +struct class_port_info_context { + struct ib_class_port_info *class_port_info; + struct ib_device *device; + struct completion done; + struct ib_sa_query *sa_query; + u8 port_num; +}; + +static int cma_ps_alloc(struct vnet *vnet, enum rdma_port_space ps, + struct rdma_bind_list *bind_list, int snum) +{ + struct idr *idr = cma_pernet_idr(vnet, ps); + + return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); +} + +static struct rdma_bind_list *cma_ps_find(struct vnet *net, + enum rdma_port_space ps, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + return idr_find(idr, snum); +} + +static void cma_ps_remove(struct vnet *net, enum rdma_port_space ps, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + idr_remove(idr, snum); +} + enum { CMA_OPTION_AFONLY, }; +void cma_ref_dev(struct 
cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port) +{ + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. 
*/ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; - struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ struct cma_device *cma_dev; struct list_head mc_list; int internal_id; enum rdma_cm_state state; spinlock_t lock; - spinlock_t cm_lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; - struct work_struct work; /* garbage coll */ int backlog; int timeout_ms; struct ib_sa_query *query; int query_id; union { struct ib_cm_id *ib; struct iw_cm_id *iw; } cm_id; u32 seq_num; u32 qkey; u32 qp_num; pid_t owner; u32 options; u8 srq; u8 tos; u8 reuseaddr; u8 afonly; - int qp_timeout; - /* cache for mc record params */ - struct ib_sa_mcmember_rec rec; - int is_valid_rec; + enum ib_gid_type gid_type; }; struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *ib; } multicast; struct list_head list; void *context; struct sockaddr_storage addr; struct kref mcref; + bool igmp_joined; + u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; struct cma_ndev_work { struct work_struct work; struct rdma_id_private *id; struct rdma_cm_event event; }; struct iboe_mcast_work { struct work_struct work; struct rdma_id_private *id; struct cma_multicast *mc; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; -struct sdp_hh { - u8 bsdh[16]; - u8 sdp_version; /* Major version: 7:4 */ - u8 ip_version; /* IP version: 7:4 */ - u8 sdp_specific1[10]; - __be16 port; - __be16 sdp_specific2; - union cma_ip_addr src_addr; - union cma_ip_addr dst_addr; -}; +#define CMA_VERSION 0x00 
-struct sdp_hah { - u8 bsdh[16]; - u8 sdp_version; +struct cma_req_info { + struct ib_device *device; + int port; + union ib_gid local_gid; + __be64 service_id; + u16 pkey; + bool has_gid:1; }; -#define CMA_VERSION 0x00 -#define SDP_MAJ_VERSION 0x2 - static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); ret = (id_priv->state == comp); spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, enum rdma_cm_state exch) { unsigned long flags; enum rdma_cm_state old; spin_lock_irqsave(&id_priv->lock, flags); old = id_priv->state; id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return old; } -static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static inline u8 sdp_get_majv(u8 sdp_version) +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) { - return sdp_version >> 4; -} - -static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) -{ - return hh->ip_version >> 4; -} - -static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) -{ - hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); -} - -static void cma_attach_to_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) -{ - atomic_inc(&cma_dev->refcount); + cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; + id_priv->gid_type = 0; id_priv->id.device = 
cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } -static inline void cma_deref_dev(struct cma_device *cma_dev) +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) { + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + +void cma_deref_dev(struct cma_device *cma_dev) +{ if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } static inline void release_mc(struct kref *kref) { struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); kfree(mc->multicast.ib); kfree(mc); } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del(&id_priv->list); cma_deref_dev(id_priv->cma_dev); id_priv->cma_dev = NULL; mutex_unlock(&lock); } -static int cma_set_qkey(struct rdma_id_private *id_priv) +static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { + return (struct sockaddr *) &id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; +} + +static inline unsigned short cma_family(struct rdma_id_private *id_priv) +{ + return id_priv->id.route.addr.src_addr.ss_family; +} + +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) + if (id_priv->qkey) { + if (qkey && id_priv->qkey != qkey) + return -EINVAL; return 0; + } + if (qkey) { + id_priv->qkey = qkey; + return 0; + } + switch (id_priv->id.ps) { case RDMA_PS_UDP: + case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, 
&rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } -static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) +static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { - int i; - int err; - struct ib_port_attr props; - union ib_gid tmp; + dev_addr->dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); + ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); +} - err = ib_query_port(device, port_num, &props); - if (err) - return 1; +static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + int ret; - for (i = 0; i < props.gid_tbl_len; ++i) { - err = ib_query_gid(device, port_num, i, &tmp); - if (err) - return 1; - if (!memcmp(&tmp, gid, sizeof tmp)) - return 0; + if (addr->sa_family != AF_IB) { + ret = rdma_translate_ip(addr, dev_addr, NULL); + } else { + cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); + ret = 0; } - return -EAGAIN; + return ret; } -int -rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type, - void **cm_id) +static inline int cma_validate_port(struct ib_device *device, u8 port, + enum ib_gid_type gid_type, + union ib_gid *gid, int dev_type, + struct vnet *net, + int bound_if_index) { - int ret; - u8 port; - int found_dev = 0, found_cmid = 0; - struct rdma_id_private *id_priv; - struct rdma_id_private *dev_id_priv; - struct cma_device *cma_dev; - struct rdma_dev_addr dev_addr; - union ib_gid gid; - enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ? 
- IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + int ret = -ENODEV; + struct net_device *ndev = NULL; - memset(&dev_addr, 0, sizeof(dev_addr)); + if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) + return ret; - ret = rdma_translate_ip((struct sockaddr *)local_addr, - &dev_addr, NULL); - if (ret) - goto err; + if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) + return ret; - /* find rdma device based on MAC address/gid */ - mutex_lock(&lock); + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(net, bound_if_index); + if (ndev && ndev->if_flags & IFF_LOOPBACK) { + pr_info("detected loopback device\n"); + dev_put(ndev); - memcpy(&gid, dev_addr.src_dev_addr + - rdma_addr_gid_offset(&dev_addr), sizeof(gid)); + if (!device->get_netdev) + return -EOPNOTSUPP; - list_for_each_entry(cma_dev, &dev_list, list) - for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) - if ((rdma_port_get_link_layer(cma_dev->device, port) == - dev_ll) && - (rdma_node_get_transport(cma_dev->device->node_type) == - RDMA_TRANSPORT_IWARP)) { - ret = find_gid_port(cma_dev->device, - &gid, port); - if (!ret) { - found_dev = 1; - goto out; - } else if (ret == 1) { - mutex_unlock(&lock); - goto err; - } - } -out: - mutex_unlock(&lock); + ndev = device->get_netdev(device, port); + if (!ndev) + return -ENODEV; + } + } else { + gid_type = IB_GID_TYPE_IB; + } - if (!found_dev) - goto err; + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, NULL); - /* Traverse through the list of listening cm_id's to find the - * desired cm_id based on rdma device & port number. 
- */ - list_for_each_entry(id_priv, &listen_any_list, list) - list_for_each_entry(dev_id_priv, &id_priv->listen_list, - listen_list) - if (dev_id_priv->cma_dev == cma_dev) - if (dev_id_priv->cm_id.iw->local_addr.sin_port - == local_addr->sin_port) { - *cm_id = (void *)dev_id_priv->cm_id.iw; - found_cmid = 1; - } - return found_cmid ? 0 : -ENODEV; + if (ndev) + dev_put(ndev); -err: - return -ENODEV; + return ret; } -EXPORT_SYMBOL(rdma_find_cmid_laddr); static int cma_acquire_dev(struct rdma_id_private *id_priv, struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid, iboe_gid; + union ib_gid gid, iboe_gid, *gidp; int ret = -ENODEV; - u8 port, found_port; - enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? - IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + u8 port; - if (dev_ll != IB_LINK_LAYER_INFINIBAND && + if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; mutex_lock(&lock); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); - if (listen_id_priv && - rdma_port_get_link_layer(listen_id_priv->id.device, - listen_id_priv->id.port_num) == dev_ll) { + + if (listen_id_priv) { cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, - &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, - &found_port, NULL); + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; - if (!ret && (port == found_port)) { - id_priv->id.port_num = found_port; + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? 
+ IB_GID_TYPE_IB : + listen_id_priv->gid_type, gidp, + dev_addr->dev_type, + dev_addr->net, + dev_addr->bound_dev_if); + if (!ret) { + id_priv->id.port_num = port; goto out; } } + list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { if (listen_id_priv && listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; - if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); - if (!ret && (port == found_port)) { - id_priv->id.port_num = port; - goto out; - } else if (ret == 1) - break; + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + cma_dev->default_gid_type[port - 1], + gidp, dev_addr->dev_type, + dev_addr->net, + dev_addr->bound_dev_if); + if (!ret) { + id_priv->id.port_num = port; + goto out; } } } out: if (!ret) cma_attach_to_dev(id_priv, cma_dev); mutex_unlock(&lock); return ret; } -static void cma_deref_id(struct rdma_id_private *id_priv) +/* + * Select the source IB device and address to reach the destination IB address. 
+ */ +static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { - if (atomic_dec_and_test(&id_priv->refcount)) - complete(&id_priv->comp); -} + struct cma_device *cma_dev, *cur_dev; + struct sockaddr_ib *addr; + union ib_gid gid, sgid, *dgid; + u16 pkey, index; + u8 p; + int i; -static int cma_disable_callback(struct rdma_id_private *id_priv, - enum rdma_cm_state state) -{ - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != state) { - mutex_unlock(&id_priv->handler_mutex); - return -EINVAL; + cma_dev = NULL; + addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); + dgid = (union ib_gid *) &addr->sib_addr; + pkey = ntohs(addr->sib_pkey); + + list_for_each_entry(cur_dev, &dev_list, list) { + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!rdma_cap_af_ib(cur_dev->device, p)) + continue; + + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) + continue; + + for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, + &gid, NULL); + i++) { + if (!memcmp(&gid, dgid, sizeof(gid))) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + + if (!cma_dev && (gid.global.subnet_prefix == + dgid->global.subnet_prefix)) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + } + } + } } + + if (!cma_dev) + return -ENODEV; + +found: + cma_attach_to_dev(id_priv, cma_dev); + addr = (struct sockaddr_ib *) cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof sgid); + cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } -struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, +static void cma_deref_id(struct rdma_id_private *id_priv) +{ + if (atomic_dec_and_test(&id_priv->refcount)) + complete(&id_priv->comp); +} + +struct rdma_cm_id *rdma_create_id(struct vnet *net, + rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return 
ERR_PTR(-ENOMEM); - id_priv->owner = curthread->td_proc->p_pid; + id_priv->owner = task_pid_nr(current); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); - spin_lock_init(&id_priv->cm_lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); + id_priv->id.route.addr.dev_addr.net = TD_TO_VNET(curthread); return &id_priv->id; } EXPORT_SYMBOL(rdma_create_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) return -EINVAL; + qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, 
qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto err; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); return 0; err: ib_destroy_qp(qp); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; + ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, - qp_attr.ah_attr.grh.sgid_index, &sgid); + qp_attr.ah_attr.grh.sgid_index, &sgid, NULL); if (ret) goto out; - if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) - == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) - == IB_LINK_LAYER_ETHERNET) { - u32 scope_id = rdma_get_ipv6_scope_id(id_priv->id.device, - id_priv->id.port_num); + BUG_ON(id_priv->cma_dev->device != id_priv->id.device); - ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL, - scope_id); - if (ret) - goto out; - } - if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct 
rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; - - if (id_priv->qp_timeout && id_priv->id.qp->qp_type == IB_QPT_RC) { - qp_attr.timeout = id_priv->qp_timeout; - qp_attr_mask |= IB_QP_TIMEOUT; - } - ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; - if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == - IB_LINK_LAYER_INFINIBAND) - pkey = ib_addr_get_pkey(dev_addr); - else + if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; + else + pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, 0); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct 
rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); - break; - default: + } else ret = -ENOSYS; - break; - } return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline int cma_zero_addr(struct sockaddr *addr) { - struct in6_addr *ip6; - - if (addr->sa_family == AF_INET) - return ipv4_is_zeronet( - ((struct sockaddr_in *)addr)->sin_addr.s_addr); - else { - ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; - return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | - ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; } } static inline int cma_loopback_addr(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) - return ipv4_is_loopback( - ((struct sockaddr_in *) addr)->sin_addr.s_addr); - else - return ipv6_addr_loopback( - &((struct sockaddr_in6 *) addr)->sin6_addr); + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + case AF_INET6: + return 
ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } } static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } -int -rdma_cma_any_addr(struct sockaddr *addr) -{ - return cma_any_addr(addr); -} -EXPORT_SYMBOL(rdma_cma_any_addr); static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *) src)->sin_addr.s_addr != ((struct sockaddr_in *) dst)->sin_addr.s_addr; - default: + case AF_INET6: return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); + default: + return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, + &((struct sockaddr_ib *) dst)->sib_addr); } } -static inline __be16 cma_port(struct sockaddr *addr) +static __be16 cma_port(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) + struct sockaddr_ib *sib; + + switch (addr->sa_family) { + case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; - else + case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + return htons((u16) (be64_to_cpu(sib->sib_sid) & + be64_to_cpu(sib->sib_sid_mask))); + default: + return 0; + } } static inline int cma_any_port(struct sockaddr *addr) { return !cma_port(addr); } -static int cma_get_net_info(void *hdr, enum rdma_port_space ps, - u8 *ip_ver, __be16 *port, - union cma_ip_addr **src, union cma_ip_addr **dst) +static void cma_save_ib_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, + struct ib_sa_path_rec *path) { - switch (ps) { - case RDMA_PS_SDP: - if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != - SDP_MAJ_VERSION) - return -EINVAL; + struct sockaddr_ib *listen_ib, *ib; - *ip_ver = 
sdp_get_ip_ver(hdr); - *port = ((struct sdp_hh *) hdr)->port; - *src = &((struct sdp_hh *) hdr)->src_addr; - *dst = &((struct sdp_hh *) hdr)->dst_addr; + listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; + if (src_addr) { + ib = (struct sockaddr_ib *)src_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + ib->sib_sid = path->service_id; + ib->sib_scope_id = 0; + } else { + ib->sib_pkey = listen_ib->sib_pkey; + ib->sib_flowinfo = listen_ib->sib_flowinfo; + ib->sib_addr = listen_ib->sib_addr; + ib->sib_sid = listen_ib->sib_sid; + ib->sib_scope_id = listen_ib->sib_scope_id; + } + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + } + if (dst_addr) { + ib = (struct sockaddr_ib *)dst_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); + } + } +} + +static void cma_save_ip4_info(struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->dst_addr.ip4.addr, + .sin_port = local_port, + }; + } + + if (dst_addr) { + *dst_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->src_addr.ip4.addr, + .sin_port = hdr->port, + }; + } +} + +static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, + struct sockaddr_in6 *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->dst_addr.ip6, + .sin6_port = local_port, + }; + } + + if (dst_addr) { + *dst_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->src_addr.ip6, + .sin6_port = hdr->port, + }; + } +} + +static u16 cma_port_from_service_id(__be64 service_id) +{ + return 
(u16)be64_to_cpu(service_id); +} + +static int cma_save_ip_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct ib_cm_event *ib_event, + __be64 service_id) +{ + struct cma_hdr *hdr; + __be16 port; + + hdr = ib_event->private_data; + if (hdr->cma_version != CMA_VERSION) + return -EINVAL; + + port = htons(cma_port_from_service_id(service_id)); + + switch (cma_get_ip_ver(hdr)) { + case 4: + cma_save_ip4_info((struct sockaddr_in *)src_addr, + (struct sockaddr_in *)dst_addr, hdr, port); break; + case 6: + cma_save_ip6_info((struct sockaddr_in6 *)src_addr, + (struct sockaddr_in6 *)dst_addr, hdr, port); + break; default: - if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) - return -EINVAL; + return -EAFNOSUPPORT; + } - *ip_ver = cma_get_ip_ver(hdr); - *port = ((struct cma_hdr *) hdr)->port; - *src = &((struct cma_hdr *) hdr)->src_addr; - *dst = &((struct cma_hdr *) hdr)->dst_addr; - break; + return 0; +} + +static int cma_save_net_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event, + sa_family_t sa_family, __be64 service_id) +{ + if (sa_family == AF_IB) { + if (ib_event->event == IB_CM_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, + ib_event->param.req_rcvd.primary_path); + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); + return 0; } - if (*ip_ver != 4 && *ip_ver != 6) + return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); +} + +static int cma_save_req_info(const struct ib_cm_event *ib_event, + struct cma_req_info *req) +{ + const struct ib_cm_req_event_param *req_param = + &ib_event->param.req_rcvd; + const struct ib_cm_sidr_req_event_param *sidr_param = + &ib_event->param.sidr_req_rcvd; + + switch (ib_event->event) { + case IB_CM_REQ_RECEIVED: + req->device = req_param->listen_id->device; + req->port = req_param->port; + memcpy(&req->local_gid, &req_param->primary_path->sgid, + 
sizeof(req->local_gid)); + req->has_gid = true; + req->service_id = req_param->primary_path->service_id; + req->pkey = be16_to_cpu(req_param->primary_path->pkey); + if (req->pkey != req_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + req_param->bth_pkey, req->pkey); + break; + case IB_CM_SIDR_REQ_RECEIVED: + req->device = sidr_param->listen_id->device; + req->port = sidr_param->port; + req->has_gid = false; + req->service_id = sidr_param->service_id; + req->pkey = sidr_param->pkey; + if (req->pkey != sidr_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + sidr_param->bth_pkey, req->pkey); + break; + default: return -EINVAL; + } + return 0; } -static void cma_save_net_info(struct rdma_addr *addr, - struct rdma_addr *listen_addr, - u8 ip_ver, __be16 port, - union cma_ip_addr *src, union cma_ip_addr *dst) +static bool validate_ipv4_net_dev(struct net_device *net_dev, + const struct sockaddr_in *dst_addr, + const struct sockaddr_in *src_addr) { - struct sockaddr_in *listen4, *ip4; - struct sockaddr_in6 *listen6, *ip6; +#ifdef INET + struct sockaddr_in dst_tmp = *dst_addr; + __be32 daddr = dst_addr->sin_addr.s_addr, + saddr = src_addr->sin_addr.s_addr; + struct net_device *src_dev; + struct rtentry *rte; + bool ret; - switch (ip_ver) { - case 4: - listen4 = (struct sockaddr_in *) &listen_addr->src_addr; - ip4 = (struct sockaddr_in *) &addr->src_addr; - ip4->sin_family = listen4->sin_family; - ip4->sin_addr.s_addr = dst->ip4.addr; - ip4->sin_port = listen4->sin_port; - ip4->sin_len = sizeof(struct sockaddr_in); + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || + ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || + 
ipv4_is_loopback(saddr)) + return false; - ip4 = (struct sockaddr_in *) &addr->dst_addr; - ip4->sin_family = listen4->sin_family; - ip4->sin_addr.s_addr = src->ip4.addr; - ip4->sin_port = port; - ip4->sin_len = sizeof(struct sockaddr_in); - break; - case 6: - listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; - ip6 = (struct sockaddr_in6 *) &addr->src_addr; - ip6->sin6_family = listen6->sin6_family; - ip6->sin6_addr = dst->ip6; - ip6->sin6_port = listen6->sin6_port; - ip6->sin6_len = sizeof(struct sockaddr_in6); - ip6->sin6_scope_id = listen6->sin6_scope_id; + src_dev = ip_dev_find(net_dev->if_vnet, saddr); + if (src_dev != net_dev) + return false; - ip6 = (struct sockaddr_in6 *) &addr->dst_addr; - ip6->sin6_family = listen6->sin6_family; - ip6->sin6_addr = src->ip6; - ip6->sin6_port = port; - ip6->sin6_len = sizeof(struct sockaddr_in6); - ip6->sin6_scope_id = listen6->sin6_scope_id; - break; + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. + */ + dst_tmp.sin_len = sizeof(dst_tmp); + + CURVNET_SET(net_dev->if_vnet); + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + if (rte != NULL) { + ret = (rte->rt_ifp == net_dev); + RTFREE_LOCKED(rte); + } else { + ret = false; + } + return ret; +#else + return false; +#endif +} + +static bool validate_ipv6_net_dev(struct net_device *net_dev, + const struct sockaddr_in6 *dst_addr, + const struct sockaddr_in6 *src_addr) +{ +#ifdef INET6 + struct sockaddr_in6 dst_tmp = *dst_addr; + struct in6_addr in6_addr = src_addr->sin6_addr; + struct net_device *src_dev; + struct rtentry *rte; + bool ret; + + /* embed scope ID */ + in6_addr.s6_addr[3] = src_addr->sin6_scope_id; + + src_dev = ip6_dev_find(net_dev->if_vnet, in6_addr); + if (src_dev != net_dev) + return false; + + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. 
+ */ + dst_tmp.sin6_len = sizeof(dst_tmp); + + CURVNET_SET(net_dev->if_vnet); + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + if (rte != NULL) { + ret = (rte->rt_ifp == net_dev); + RTFREE_LOCKED(rte); + } else { + ret = false; + } + return ret; +#else + return false; +#endif +} + +static bool validate_net_dev(struct net_device *net_dev, + const struct sockaddr *daddr, + const struct sockaddr *saddr) +{ + const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; + const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; + + switch (daddr->sa_family) { + case AF_INET: + return saddr->sa_family == AF_INET && + validate_ipv4_net_dev(net_dev, daddr4, saddr4); + + case AF_INET6: + return saddr->sa_family == AF_INET6 && + validate_ipv6_net_dev(net_dev, daddr6, saddr6); + default: - break; + return false; } } -static inline int cma_user_data_offset(enum rdma_port_space ps) +static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, + const struct cma_req_info *req) { - switch (ps) { - case RDMA_PS_SDP: - return 0; + struct sockaddr_storage listen_addr_storage, src_addr_storage; + struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage, + *src_addr = (struct sockaddr *)&src_addr_storage; + struct net_device *net_dev; + const union ib_gid *gid = req->has_gid ? 
&req->local_gid : NULL; + int err; + + err = cma_save_ip_info(listen_addr, src_addr, ib_event, + req->service_id); + if (err) + return ERR_PTR(err); + + net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, + gid, listen_addr); + if (!net_dev) + return ERR_PTR(-ENODEV); + + if (!validate_net_dev(net_dev, listen_addr, src_addr)) { + dev_put(net_dev); + return ERR_PTR(-EHOSTUNREACH); + } + + return net_dev; +} + +static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id) +{ + return (be64_to_cpu(service_id) >> 16) & 0xffff; +} + +static bool cma_match_private_data(struct rdma_id_private *id_priv, + const struct cma_hdr *hdr) +{ + struct sockaddr *addr = cma_src_addr(id_priv); + __be32 ip4_addr; + struct in6_addr ip6_addr; + + if (cma_any_addr(addr) && !id_priv->afonly) + return true; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; + if (cma_get_ip_ver(hdr) != 4) + return false; + if (!cma_any_addr(addr) && + hdr->dst_addr.ip4.addr != ip4_addr) + return false; + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; + if (cma_get_ip_ver(hdr) != 6) + return false; + if (!cma_any_addr(addr) && + memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) + return false; + break; + case AF_IB: + return true; default: - return sizeof(struct cma_hdr); + return false; } + + return true; } +static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num) +{ + enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num); + enum rdma_transport_type transport = + rdma_node_get_transport(device->node_type); + + return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB; +} + +static bool cma_protocol_roce(const struct rdma_cm_id *id) +{ + struct ib_device *device = id->device; + const int port_num = id->port_num ?: rdma_start_port(device); + + return cma_protocol_roce_dev_port(device, port_num); +} + +static bool cma_match_net_dev(const 
struct rdma_cm_id *id, + const struct net_device *net_dev, + u8 port_num) +{ + const struct rdma_addr *addr = &id->route.addr; + + if (!net_dev) + /* This request is an AF_IB request or a RoCE request */ + return (!id->port_num || id->port_num == port_num) && + (addr->src_addr.ss_family == AF_IB || + cma_protocol_roce_dev_port(id->device, port_num)); + + return !addr->dev_addr.bound_dev_if || + (net_eq(dev_net(net_dev), addr->dev_addr.net) && + addr->dev_addr.bound_dev_if == net_dev->if_index); +} + +static struct rdma_id_private *cma_find_listener( + const struct rdma_bind_list *bind_list, + const struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + const struct cma_req_info *req, + const struct net_device *net_dev) +{ + struct rdma_id_private *id_priv, *id_priv_dev; + + if (!bind_list) + return ERR_PTR(-EINVAL); + + hlist_for_each_entry(id_priv, &bind_list->owners, node) { + if (cma_match_private_data(id_priv, ib_event->private_data)) { + if (id_priv->id.device == cm_id->device && + cma_match_net_dev(&id_priv->id, net_dev, req->port)) + return id_priv; + list_for_each_entry(id_priv_dev, + &id_priv->listen_list, + listen_list) { + if (id_priv_dev->id.device == cm_id->device && + cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + return id_priv_dev; + } + } + } + + return ERR_PTR(-EINVAL); +} + +static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, + struct ib_cm_event *ib_event, + struct net_device **net_dev) +{ + struct cma_req_info req; + struct rdma_bind_list *bind_list; + struct rdma_id_private *id_priv; + int err; + + err = cma_save_req_info(ib_event, &req); + if (err) + return ERR_PTR(err); + + *net_dev = cma_get_net_dev(ib_event, &req); + if (IS_ERR(*net_dev)) { + if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { + /* Assuming the protocol is AF_IB */ + *net_dev = NULL; + } else if (cma_protocol_roce_dev_port(req.device, req.port)) { + /* TODO find the net dev matching the request parameters + * through the RoCE GID table */ + 
*net_dev = NULL; + } else { + return ERR_CAST(*net_dev); + } + } + + bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, + rdma_ps_from_service_id(req.service_id), + cma_port_from_service_id(req.service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + if (IS_ERR(id_priv) && *net_dev) { + dev_put(*net_dev); + *net_dev = NULL; + } + + return id_priv; +} + +static inline int cma_user_data_offset(struct rdma_id_private *id_priv) +{ + return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); +} + static void cma_cancel_route(struct rdma_id_private *id_priv) { - switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { - case IB_LINK_LAYER_INFINIBAND: + if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); - break; - default: - break; } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. 
*/ mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_entry(id_priv->listen_list.next, struct rdma_id_private, listen_list); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->list); list_del(&dev_id_priv->listen_list); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: - if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) - && !id_priv->cma_dev) + if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { - struct rdma_bind_list *bind_list; + struct rdma_bind_list *bind_list = id_priv->bind_list; + struct vnet *net = id_priv->id.route.addr.dev_addr.net; - mutex_lock(&lock); - bind_list = id_priv->bind_list; - if (!bind_list) { - mutex_unlock(&lock); + if (!bind_list) return; - } + + mutex_lock(&lock); hlist_del(&id_priv->node); - id_priv->bind_list = NULL; if (hlist_empty(&bind_list->owners)) { - idr_remove(bind_list->ps, bind_list->port); + cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); - switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { - case IB_LINK_LAYER_INFINIBAND: + if (rdma_cap_ib_mcast(id_priv->cma_dev->device, + id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - break; - 
case IB_LINK_LAYER_ETHERNET: + } else { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev) { + dev_put(ndev); + } + } kref_put(&mc->mcref, release_mc); - break; - default: - break; } } } -static void __rdma_free(struct work_struct *work) -{ - struct rdma_id_private *id_priv; - id_priv = container_of(work, struct rdma_id_private, work); - wait_for_completion(&id_priv->comp); - - if (id_priv->internal_id) - cma_deref_id(id_priv->id.context); - - kfree(id_priv->id.route.path_rec); - kfree(id_priv); -} - void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; enum rdma_cm_state state; - unsigned long flags; - struct ib_cm_id *ib; id_priv = container_of(id, struct rdma_id_private, id); state = cma_exch(id_priv, RDMA_CM_DESTROYING); cma_cancel_operation(id_priv, state); /* * Wait for any active callback to finish. New callbacks will find * the id_priv state set to destroying and abort. 
*/ mutex_lock(&id_priv->handler_mutex); mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: - spin_lock_irqsave(&id_priv->cm_lock, flags); - if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) { - ib = id_priv->cm_id.ib; - id_priv->cm_id.ib = NULL; - spin_unlock_irqrestore(&id_priv->cm_lock, flags); - ib_destroy_cm_id(ib); - } else - spin_unlock_irqrestore(&id_priv->cm_lock, flags); - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + if (rdma_cap_ib_cm(id_priv->id.device, 1)) { + if (id_priv->cm_id.ib) + ib_destroy_cm_id(id_priv->cm_id.ib); + } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); - break; - default: - break; } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_deref_id(id_priv); - INIT_WORK(&id_priv->work, __rdma_free); - queue_work(cma_free_wq, &id_priv->work); + wait_for_completion(&id_priv->comp); + + if (id_priv->internal_id) + cma_deref_id(id_priv->id.context); + + kfree(id_priv->id.route.path_rec); + kfree(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; - cma_dbg(id_priv, "sending RTU\n"); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); - cma_dbg(id_priv, "sending REJ\n"); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } -static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) -{ - if (id_priv->id.ps == RDMA_PS_SDP && - sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != - SDP_MAJ_VERSION) - return -EINVAL; - - return 0; -} - static void cma_set_rep_event_data(struct rdma_cm_event *event, struct 
ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; } static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; int ret = 0; + mutex_lock(&id_priv->handler_mutex); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || + id_priv->state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) - return 0; + id_priv->state != RDMA_CM_DISCONNECT)) + goto out; + memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: - event.status = cma_verify_rep(id_priv, ib_event->private_data); - if (event.status) - event.event = RDMA_CM_EVENT_CONNECT_ERROR; - else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { + if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? 
RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; - } else + } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; /* fall through */ case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event) + struct ib_cm_event *ib_event, + struct net_device *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; - union cma_ip_addr *src, *dst; - __be16 port; - u8 ip_ver; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + const __be64 service_id = + ib_event->param.req_rcvd.primary_path->service_id; int ret; - if (cma_get_net_info(ib_event->private_data, listen_id->ps, - &ip_ver, &port, &src, &dst)) - return NULL; - - id = rdma_create_id(listen_id->event_handler, listen_id->context, + id = rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) return NULL; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, - ip_ver, port, src, dst); + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, service_id)) + goto err; rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 
2 : 1; rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { - rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; - rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); - ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); - } else { - ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, - &rt->addr.dev_addr, NULL); + if (net_dev) { + ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); if (ret) goto err; + } else { + if (!cma_protocol_roce(listen_id) && + cma_any_addr(cma_src_addr(id_priv))) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + } else if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); + if (ret) + goto err; + } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event) + struct ib_cm_event *ib_event, + struct net_device *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; - union cma_ip_addr *src, *dst; - __be16 port; - u8 ip_ver; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct vnet *net = listen_id->route.addr.dev_addr.net; int ret; - id = rdma_create_id(listen_id->event_handler, listen_id->context, + id = rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return NULL; - - if 
(cma_get_net_info(ib_event->private_data, listen_id->ps, - &ip_ver, &port, &src, &dst)) + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, + ib_event->param.sidr_req_rcvd.service_id)) goto err; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, - ip_ver, port, src, dst); - - if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr, NULL); + if (net_dev) { + ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); if (ret) goto err; + } else { + if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), + &id->route.addr.dev_addr); + if (ret) + goto err; + } } - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, struct ib_cm_req_event_param *req_data, void *private_data, int offset) { - event->param.conn.private_data = private_data + offset; + event->param.conn.private_data = (char *)private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; } static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == 
IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { - struct rdma_id_private *listen_id, *conn_id; + struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event; + struct net_device *net_dev; int offset, ret; - u8 smac[ETH_ALEN]; - u8 alt_smac[ETH_ALEN]; - u8 *psmac = smac; - u8 *palt_smac = alt_smac; - int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) == - RDMA_TRANSPORT_IB) && - (rdma_port_get_link_layer(cm_id->device, - ib_event->param.req_rcvd.port) == - IB_LINK_LAYER_ETHERNET)); - int is_sidr = 0; - listen_id = cm_id->context; - if (!cma_check_req_qp_type(&listen_id->id, ib_event)) - return -EINVAL; + listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); + if (IS_ERR(listen_id)) + return PTR_ERR(listen_id); - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) - return -ECONNABORTED; + if (!cma_check_req_qp_type(&listen_id->id, ib_event)) { + ret = -EINVAL; + goto net_dev_put; + } + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) { + ret = -ECONNABORTED; + goto err1; + } + memset(&event, 0, sizeof event); - offset = cma_user_data_offset(listen_id->id.ps); + offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { - is_sidr = 1; - conn_id = cma_new_udp_id(&listen_id->id, ib_event); - event.param.ud.private_data = ib_event->private_data + offset; + conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev); + event.param.ud.private_data = (char *)ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { - conn_id = cma_new_conn_id(&listen_id->id, ib_event); + conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; 
goto err1; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_acquire_dev(conn_id, listen_id); if (ret) goto err2; conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. */ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; - - if (is_iboe && !is_sidr) { - u32 scope_id = rdma_get_ipv6_scope_id(cm_id->device, - ib_event->param.req_rcvd.port); - - if (ib_event->param.req_rcvd.primary_path != NULL) - rdma_addr_find_smac_by_sgid( - &ib_event->param.req_rcvd.primary_path->sgid, - psmac, NULL, scope_id); - else - psmac = NULL; - if (ib_event->param.req_rcvd.alternate_path != NULL) - rdma_addr_find_smac_by_sgid( - &ib_event->param.req_rcvd.alternate_path->sgid, - palt_smac, NULL, scope_id); - else - palt_smac = NULL; - } - /* - * Acquire mutex to prevent user executing rdma_destroy_id() - * while we're accessing the cm_id. - */ - mutex_lock(&lock); - if (is_iboe && !is_sidr) - ib_update_cm_av(cm_id, psmac, palt_smac); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) { - cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n"); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); - } - mutex_unlock(&lock); - mutex_unlock(&conn_id->handler_mutex); + /* + * Acquire mutex to prevent user executing rdma_destroy_id() + * while we're accessing the cm_id. + */ + mutex_lock(&lock); + if (cma_comp(conn_id, RDMA_CM_CONNECT) && + (conn_id->id.qp_type != IB_QPT_UD)) + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + mutex_unlock(&lock); + mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); cma_deref_id(conn_id); + if (net_dev) + dev_put(net_dev); return 0; err3: cma_deref_id(conn_id); /* Destroy the CM ID by returning a non-zero value. 
*/ conn_id->cm_id.ib = NULL; err2: cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); err1: mutex_unlock(&listen_id->handler_mutex); if (conn_id) rdma_destroy_id(&conn_id->id); + +net_dev_put: + if (net_dev) + dev_put(net_dev); + return ret; } -static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { - return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); -} + if (addr->sa_family == AF_IB) + return ((struct sockaddr_ib *) addr)->sib_sid; -static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, - struct ib_cm_compare_data *compare) -{ - struct cma_hdr *cma_data, *cma_mask; - struct sdp_hh *sdp_data, *sdp_mask; - __be32 ip4_addr; - struct in6_addr ip6_addr; - - memset(compare, 0, sizeof *compare); - cma_data = (void *) compare->data; - cma_mask = (void *) compare->mask; - sdp_data = (void *) compare->data; - sdp_mask = (void *) compare->mask; - - switch (addr->sa_family) { - case AF_INET: - ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; - if (ps == RDMA_PS_SDP) { - sdp_set_ip_ver(sdp_data, 4); - sdp_set_ip_ver(sdp_mask, 0xF); - if (!cma_any_addr(addr)) { - sdp_data->dst_addr.ip4.addr = ip4_addr; - sdp_mask->dst_addr.ip4.addr = htonl(~0); - } - } else { - cma_set_ip_ver(cma_data, 4); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip4.addr = ip4_addr; - cma_mask->dst_addr.ip4.addr = htonl(~0); - } - } - break; - case AF_INET6: - ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; - if (ps == RDMA_PS_SDP) { - sdp_set_ip_ver(sdp_data, 6); - sdp_set_ip_ver(sdp_mask, 0xF); - if (!cma_any_addr(addr)) { - sdp_data->dst_addr.ip6 = ip6_addr; - memset(&sdp_mask->dst_addr.ip6, 0xFF, - sizeof(sdp_mask->dst_addr.ip6)); - } - } else { - cma_set_ip_ver(cma_data, 6); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip6 = ip6_addr; - 
memset(&cma_mask->dst_addr.ip6, 0xFF, - sizeof(cma_mask->dst_addr.ip6)); - } - } - break; - default: - break; - } + return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } +EXPORT_SYMBOL(rdma_get_service_id); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; - struct sockaddr_in *sin; int ret = 0; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; - *sin = iw_event->remote_addr; - switch ((int)iw_event->status) { + memcpy(cma_src_addr(id_priv), laddr, + rdma_addr_size(laddr)); + memcpy(cma_dst_addr(id_priv), raddr, + rdma_addr_size(raddr)); + switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: BUG_ON(1); } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = 
iw_event->private_data_len; ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } +out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct sockaddr_in *sin; - struct net_device *dev = NULL; struct rdma_cm_event event; - int ret; - struct ib_device_attr attr; + int ret = -ECONNABORTED; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) - return -ECONNABORTED; + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) + goto out; + /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = rdma_create_id(listen_id->id.event_handler, + new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); - if (!dev) { - ret = -EADDRNOTAVAIL; - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = cma_acquire_dev(conn_id, listen_id); if (ret) { 
mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; - *sin = iw_event->remote_addr; + memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); + memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - ret = ib_query_device(conn_id->id.device, &attr); - if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. 
*/ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); rdma_destroy_id(&conn_id->id); goto out; } mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); out: - if (dev) - dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { - struct ib_cm_compare_data compare_data; struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; - int ret; - id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); + id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); - id_priv->cm_id.ib = id; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - svc_id = cma_get_service_id(id_priv->id.ps, addr); - if (cma_any_addr(addr) && !id_priv->afonly) - ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); - else { - cma_set_compare_data(id_priv->id.ps, addr, &compare_data); - ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); - } - - if (ret) { - ib_destroy_cm_id(id_priv->cm_id.ib); - id_priv->cm_id.ib = NULL; - } - - return ret; + return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; - struct sockaddr_in *sin; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, - id_priv->sock, - iw_conn_req_handler, - id_priv); + iw_conn_req_handler, + id_priv); if (IS_ERR(id)) return PTR_ERR(id); + id->tos = id_priv->tos; id_priv->cm_id.iw = id; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - id_priv->cm_id.iw->local_addr = *sin; + memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); 
if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; return id_priv->id.event_handler(id, event); } static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { struct rdma_id_private *dev_id_priv; struct rdma_cm_id *id; + struct vnet *net = id_priv->id.route.addr.dev_addr.net; int ret; - id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) + return; + + id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) return; dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = RDMA_CM_ADDR_BOUND; - dev_id_priv->sock = id_priv->sock; - memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); - cma_attach_to_dev(dev_id_priv, cma_dev); + _cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; ret = rdma_listen(id, id_priv->backlog); if (ret) - cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name); + pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", + ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; mutex_lock(&lock); list_add_tail(&id_priv->list, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) cma_listen_on_dev(id_priv, cma_dev); 
mutex_unlock(&lock); } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); id_priv->tos = (u8) tos; } EXPORT_SYMBOL(rdma_set_service_type); -void rdma_set_timeout(struct rdma_cm_id *id, int timeout) -{ - struct rdma_id_private *id_priv; - - id_priv = container_of(id, struct rdma_id_private, id); - id_priv->qp_timeout = (u8) timeout; -} -EXPORT_SYMBOL(rdma_set_timeout); - static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { struct cma_work *work = context; struct rdma_route *route; route = &work->id->id.route; if (!status) { route->num_paths = 1; *route->path_rec = *path_rec; } else { work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; } queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { - struct rdma_addr *addr = &id_priv->id.route.addr; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct ib_sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; + struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); - rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); - rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); - path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); + rdma_addr_get_sgid(dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(dev_addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; - path_rec.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &addr->dst_addr); + path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | 
IB_SA_PATH_REC_SERVICE_ID; - if (addr->src_addr.ss_family == AF_INET) { + switch (cma_family(id_priv)) { + case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; - } else { - sin6 = (struct sockaddr_in6 *) &addr->src_addr; + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + case AF_IB: + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } -static void cma_ndev_work_handler(struct work_struct *_work) -{ - struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); - struct rdma_id_private *id_priv = work->id; - int destroy = 0; - - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == RDMA_CM_DESTROYING || - id_priv->state == RDMA_CM_DEVICE_REMOVAL) - goto out; - - if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; - } - -out: - mutex_unlock(&id_priv->handler_mutex); - 
cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); - kfree(work); -} - static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_set_ib_paths(struct rdma_cm_id *id, struct ib_sa_path_rec *path_rec, int num_paths) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } id->route.num_paths = num_paths; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_paths); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; queue_work(cma_wq, &work->work); return 0; } -static u8 tos_to_sl(u8 tos) +static int iboe_tos_to_sl(struct net_device *ndev, int tos) { - return def_prec2sl & 7; + /* TODO: Implement this function */ + return 0; } +static enum ib_gid_type 
cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; - struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; - if (src_addr->sin_family != dst_addr->sin_family) - return -EINVAL; - work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_paths = 1; - if (addr->dev_addr.bound_dev_if) - ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (addr->dev_addr.bound_dev_if) { + unsigned long supported_gids; + + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + + if (ndev->if_flags & IFF_LOOPBACK) { + dev_put(ndev); + if (!id_priv->id.device->get_netdev) { + ret = -EOPNOTSUPP; + goto err2; + } + + ndev = id_priv->id.device->get_netdev(id_priv->id.device, + id_priv->id.port_num); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + } + + route->path_rec->net = ndev->if_vnet; + route->path_rec->ifindex = ndev->if_index; + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + route->path_rec->gid_type = + cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + } if (!ndev) { ret = -ENODEV; goto err2; } - route->path_rec->vlan_id = 
rdma_vlan_dev_vlan_id(ndev); memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); - memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - route->path_rec->hop_limit = 1; + /* Use the hint from IP Stack to select GID Type */ + if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) + /* TODO: get the hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else + route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; - route->path_rec->sl = tos_to_sl(id_priv->tos); - + route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; work->event.status = 0; queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); - switch 
(rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ret = cma_resolve_ib_route(id_priv, timeout_ms); - break; - case IB_LINK_LAYER_ETHERNET: - ret = cma_resolve_iboe_route(id_priv); - break; - default: - ret = -ENOSYS; - } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_route(id_priv, timeout_ms); + else if (rdma_protocol_roce(id->device, id->port_num)) + ret = cma_resolve_iboe_route(id_priv); + else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv, timeout_ms); - break; - default: + else ret = -ENOSYS; - break; - } + if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); -int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type) +static void cma_set_loopback(struct sockaddr *addr) { - /* APM is not supported yet */ - return -EINVAL; + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, + 0, 0, 0, htonl(1)); + break; + default: + ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, + 0, 0, 0, htonl(1)); + break; + } } -EXPORT_SYMBOL(rdma_enable_apm); static int cma_bind_loopback(struct rdma_id_private *id_priv) { - struct cma_device *cma_dev; + struct cma_device *cma_dev, *cur_dev; struct ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; + cma_dev = NULL; mutex_lock(&lock); - if (list_empty(&dev_list)) { + list_for_each_entry(cur_dev, &dev_list, list) { + if (cma_family(id_priv) == AF_IB && + !rdma_cap_ib_cm(cur_dev->device, 1)) + continue; + + if (!cma_dev) + cma_dev = cur_dev; + + for (p = 1; p <= 
cur_dev->device->phys_port_cnt; ++p) { + if (!ib_query_port(cur_dev->device, p, &port_attr) && + port_attr.state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + goto port_found; + } + } + } + + if (!cma_dev) { ret = -ENODEV; goto out; } - list_for_each_entry(cma_dev, &dev_list, list) - for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) - if (!ib_query_port(cma_dev->device, p, &port_attr) && - port_attr.state == IB_PORT_ACTIVE) - goto port_found; p = 1; - cma_dev = list_entry(dev_list.next, struct cma_device, list); port_found: - ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); + ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = - (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? + (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); + cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event; memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; - memcpy(&id_priv->id.route.addr.src_addr, src_addr, - ip_addr_size(src_addr)); + memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv, NULL); if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if 
(id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; - struct sockaddr *src, *dst; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - if (cma_zero_addr(src)) { - dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; - if ((src->sa_family = dst->sa_family) == AF_INET) { - ((struct sockaddr_in *)src)->sin_addr = - ((struct sockaddr_in *)dst)->sin_addr; - } else { - ((struct sockaddr_in6 *)src)->sin6_addr = - ((struct sockaddr_in6 *)dst)->sin6_addr; - } - } - work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; err: kfree(work); return ret; } -static int cma_resolve_scif(struct rdma_id_private *id_priv) +static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; + int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; - /* we probably can leave it empty here */ + if (!id_priv->cma_dev) { + ret = cma_resolve_ib_dev(id_priv); + if (ret) + goto err; + } + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) + &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); + work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; 
work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; +err: + kfree(work); + return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; src_addr->sa_family = dst_addr->sa_family; -#ifdef INET6 if (dst_addr->sa_family == AF_INET6) { - ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = - ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (IN6_IS_SCOPE_LINKLOCAL(&dst_addr6->sin6_addr)) + id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *) src_addr)->sib_pkey = + ((struct sockaddr_ib *) dst_addr)->sib_pkey; } -#endif } - if (!cma_any_addr(src_addr)) - return rdma_bind_addr(id, src_addr); - else { -#if defined(INET6) || defined(INET) - union { -#ifdef INET - struct sockaddr_in in; -#endif -#ifdef INET6 - struct sockaddr_in6 in6; -#endif - } addr; -#endif - - switch(dst_addr->sa_family) { -#ifdef INET - case AF_INET: - memset(&addr.in, 0, sizeof(addr.in)); - addr.in.sin_family = dst_addr->sa_family; - addr.in.sin_len = sizeof(addr.in); - return rdma_bind_addr(id, (struct sockaddr *)&addr.in); -#endif -#ifdef INET6 - case AF_INET6: - memset(&addr.in6, 0, sizeof(addr.in6)); - addr.in6.sin6_family = dst_addr->sa_family; - addr.in6.sin6_len = sizeof(addr.in6); - addr.in6.sin6_scope_id = - ((struct sockaddr_in6 *)dst_addr)->sin6_scope_id; - return rdma_bind_addr(id, (struct sockaddr *)&addr.in6); -#endif - default: - return -EINVAL; - } - } + return rdma_bind_addr(id, src_addr); } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms) { struct 
rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); if (ret) return ret; } + if (cma_family(id_priv) != dst_addr->sa_family) + return -EINVAL; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); - memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); - if (cma_any_addr(dst_addr)) + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); - else if (id_priv->id.device && - rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF) - ret = cma_resolve_scif(id_priv); - else - ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + } + } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (id_priv->state == RDMA_CM_IDLE) { + if (reuse || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); 
spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { - struct sockaddr_in *sin; + struct sockaddr *addr; + struct sockaddr_ib *sib; + u64 sid, mask; + __be16 port; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - sin->sin_port = htons(bind_list->port); + addr = cma_src_addr(id_priv); + port = htons(bind_list->port); + + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_port = port; + break; + case AF_INET6: + ((struct sockaddr_in6 *) addr)->sin6_port = port; + break; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + sid = be64_to_cpu(sib->sib_sid); + mask = be64_to_cpu(sib->sib_sid_mask); + sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); + sib->sib_sid_mask = cpu_to_be64(~0ULL); + break; + } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } -static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, - unsigned short snum) +static int cma_alloc_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; - int port, ret; + int ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; - do { - ret = idr_get_new_above(ps, bind_list, snum, &port); - } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); + ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, + snum); + if (ret < 0) + goto err; - if (ret) - goto err1; - - if (port != snum) { - ret = -EADDRNOTAVAIL; - goto err2; - } - bind_list->ps = ps; - bind_list->port = (unsigned short) port; + bind_list->port = (unsigned short)ret; 
cma_bind_port(bind_list, id_priv); return 0; -err2: - idr_remove(ps, port); -err1: +err: kfree(bind_list); - return ret; + return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } -static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_alloc_any_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; + struct vnet *net = id_priv->id.route.addr.dev_addr.net; + u32 rand; - inet_get_local_port_range(&init_net, &low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = random() % remaining + low; + get_random_bytes(&rand, sizeof(rand)); + rover = rand % remaining + low; retry: if (last_used_port != rover && - !idr_find(ps, (unsigned short) rover)) { + !cma_ps_find(net, ps, (unsigned short)rover)) { int ret = cma_alloc_port(ps, id_priv, rover); /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; - } + } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. 
*/ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; + addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && cur_id->reuseaddr) continue; - cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } -static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_use_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; - snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + snum = ntohs(cma_port(cma_src_addr(id_priv))); + if (snum < IPPORT_RESERVED && + priv_check(curthread, PRIV_NETINET_BINDANY) != 0) + return -EACCES; - bind_list = idr_find(ps, snum); + bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static int cma_bind_listen(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; int ret = 0; mutex_lock(&lock); if (bind_list->owners.first->next) ret = cma_check_port(bind_list, id_priv, 0); mutex_unlock(&lock); return ret; } -static int cma_get_tcp_port(struct rdma_id_private *id_priv) +static enum rdma_port_space cma_select_inet_ps( + struct rdma_id_private *id_priv) { - int ret; - int size; - struct socket *sock; 
+ switch (id_priv->id.ps) { + case RDMA_PS_TCP: + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + case RDMA_PS_IB: + return id_priv->id.ps; + default: - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret) - return ret; -#ifdef __linux__ - ret = sock->ops->bind(sock, - (struct sockaddr *) &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); -#else - ret = -sobind(sock, - (struct sockaddr *)&id_priv->id.route.addr.src_addr, - curthread); -#endif - if (ret) { - sock_release(sock); - return ret; + return 0; } +} - size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); - ret = sock_getname(sock, - (struct sockaddr *) &id_priv->id.route.addr.src_addr, - &size, 0); - if (ret) { - sock_release(sock); - return ret; +static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) +{ + enum rdma_port_space ps = 0; + struct sockaddr_ib *sib; + u64 sid_ps, mask, sid; + + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; + sid = be64_to_cpu(sib->sib_sid) & mask; + + if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { + sid_ps = RDMA_IB_IP_PS_IB; + ps = RDMA_PS_IB; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && + (sid == (RDMA_IB_IP_PS_TCP & mask))) { + sid_ps = RDMA_IB_IP_PS_TCP; + ps = RDMA_PS_TCP; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && + (sid == (RDMA_IB_IP_PS_UDP & mask))) { + sid_ps = RDMA_IB_IP_PS_UDP; + ps = RDMA_PS_UDP; } - id_priv->sock = sock; - return 0; + if (ps) { + sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); + sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | + be64_to_cpu(sib->sib_sid_mask)); + } + return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { - struct idr *ps; + enum rdma_port_space ps; int ret; - switch (id_priv->id.ps) { - case 
RDMA_PS_SDP: - ps = &sdp_ps; - break; - case RDMA_PS_TCP: - ps = &tcp_ps; - if (unify_tcp_port_space) { - ret = cma_get_tcp_port(id_priv); - if (ret) - goto out; - } - break; - case RDMA_PS_UDP: - ps = &udp_ps; - break; - case RDMA_PS_IPOIB: - ps = &ipoib_ps; - break; - case RDMA_PS_IB: - ps = &ib_ps; - break; - default: + if (cma_family(id_priv) != AF_IB) + ps = cma_select_inet_ps(id_priv); + else + ps = cma_select_ib_ps(id_priv); + if (!ps) return -EPROTONOSUPPORT; - } mutex_lock(&lock); - if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) + if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); -out: + return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { -#if defined(INET6) - struct sockaddr_in6 *sin6; +#ifdef INET6 + struct sockaddr_in6 sin6; if (addr->sa_family != AF_INET6) return 0; - sin6 = (struct sockaddr_in6 *) addr; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && - !sin6->sin6_scope_id) - return -EINVAL; + sin6 = *(struct sockaddr_in6 *)addr; - dev_addr->bound_dev_if = sin6->sin6_scope_id; + if (!(IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr))) + return 0; + + if (sa6_recoverscope(&sin6) || sin6.sin6_scope_id == 0) + return -EINVAL; + + dev_addr->bound_dev_if = sin6.sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { - ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; - ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); + id->route.addr.src_addr.ss_family = AF_INET; + ret = rdma_bind_addr(id, cma_src_addr(id_priv)); if (ret) return ret; } if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) return -EINVAL; if (id_priv->reuseaddr) { ret = cma_bind_listen(id_priv); if (ret) goto err; } 
id_priv->backlog = backlog; if (id->device) { - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; - break; - default: + } else { ret = -ENOSYS; goto err; } } else cma_listen_on_all(id_priv); return 0; err: id_priv->backlog = 0; cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; int ret; -#if defined(INET6) - int ipv6only; - size_t var_size = sizeof(int); -#endif - if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && + addr->sa_family != AF_IB) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); if (ret) goto err1; - memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL); + ret = cma_translate_addr(addr, &id->route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev(id_priv, NULL); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; -#if defined(INET6) - else if (addr->sa_family == AF_INET6) - id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only", - &ipv6only, &var_size, NULL, 0, NULL, 0); +#ifdef INET6 + else if (addr->sa_family == AF_INET6) { + CURVNET_SET_QUIET(id_priv->id.route.addr.dev_addr.net); + id_priv->afonly = V_ip6_v6only; + 
CURVNET_RESTORE(); + } #endif } ret = cma_get_port(id_priv); if (ret) goto err2; return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } EXPORT_SYMBOL(rdma_bind_addr); -static int cma_format_hdr(void *hdr, enum rdma_port_space ps, - struct rdma_route *route) +static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; - struct sdp_hh *sdp_hdr; - if (route->addr.src_addr.ss_family == AF_INET) { + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; - src4 = (struct sockaddr_in *) &route->addr.src_addr; - dst4 = (struct sockaddr_in *) &route->addr.dst_addr; + src4 = (struct sockaddr_in *) cma_src_addr(id_priv); + dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); - switch (ps) { - case RDMA_PS_SDP: - sdp_hdr = hdr; - if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) - return -EINVAL; - sdp_set_ip_ver(sdp_hdr, 4); - sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - sdp_hdr->port = src4->sin_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; - cma_set_ip_ver(cma_hdr, 4); - cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - cma_hdr->port = src4->sin_port; - break; - } - } else { + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; - src6 = (struct sockaddr_in6 *) &route->addr.src_addr; - dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr; + src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); - switch (ps) { - case RDMA_PS_SDP: - sdp_hdr = hdr; - if 
(sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) - return -EINVAL; - sdp_set_ip_ver(sdp_hdr, 6); - sdp_hdr->src_addr.ip6 = src6->sin6_addr; - sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; - sdp_hdr->port = src6->sin6_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; - cma_set_ip_ver(cma_hdr, 6); - cma_hdr->src_addr.ip6 = src6->sin6_addr; - cma_hdr->dst_addr.ip6 = dst6->sin6_addr; - cma_hdr->port = src6->sin6_port; - break; - } + cma_set_ip_ver(cma_hdr, 6); + cma_hdr->src_addr.ip6 = src6->sin6_addr; + cma_hdr->dst_addr.ip6 = dst6->sin6_addr; + cma_hdr->port = src6->sin6_port; } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; break; } - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; - event.status = -EINVAL; + event.status = ret; break; } - if (id_priv->qkey != rep->qkey) { - event.event = RDMA_CM_EVENT_UNREACHABLE; - event.status = -EINVAL; + ret = ib_init_ah_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); + if (ret) { + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = ret; break; } - 
ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, - id_priv->id.route.path_rec, - &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; - struct rdma_route *route; struct ib_cm_id *id; - int ret; + void *private_data; + int offset, ret; - req.private_data_len = sizeof(struct cma_hdr) + - conn_param->private_data_len; + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!req.private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) - memcpy((void *) req.private_data + sizeof(struct cma_hdr), - conn_param->private_data, conn_param->private_data_len); + memcpy((char *)private_data + offset, conn_param->private_data, + conn_param->private_data_len); - route = &id_priv->id.route; - ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); - if (ret) - goto out; + if (private_data) { + ret = cma_format_hdr(private_data, 
id_priv); + if (ret) + goto out; + req.private_data = private_data; + } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; - req.path = route->path_rec; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); - req.timeout_ms = 1 << (cma_response_timeout - 8); + req.path = id_priv->id.route.path_rec; + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; - cma_dbg(id_priv, "sending SIDR\n"); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: - kfree(req.private_data); + kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; int offset, ret; memset(&req, 0, sizeof req); - offset = cma_user_data_offset(id_priv->id.ps); + offset = cma_user_data_offset(id_priv); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) - memcpy(private_data + offset, conn_param->private_data, + memcpy((char *)private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; - ret = 
cma_format_hdr(private_data, id_priv->id.ps, route); - if (ret) - goto out; - req.private_data = private_data; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); - req.remote_cm_response_timeout = cma_response_timeout; - req.local_cm_response_timeout = cma_response_timeout; + req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 
1 : 0; - cma_dbg(id_priv, "sending REQ\n"); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; - struct sockaddr_in* sin; int ret; struct iw_cm_conn_param iw_param; - cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, - cma_iw_handler, id_priv); + cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); + cm_id->tos = id_priv->tos; id_priv->cm_id.iw = cm_id; - sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; - cm_id->local_addr = *sin; + memcpy(&cm_id->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), + rdma_addr_size(cma_dst_addr(id_priv))); - sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; - cm_id->remote_addr = *sin; - ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { - iw_param.ord = conn_param->initiator_depth; - iw_param.ird = conn_param->responder_resources; - iw_param.private_data = conn_param->private_data; - iw_param.private_data_len = conn_param->private_data_len; + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? 
id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_connect_iw(id_priv, conn_param); - break; - default: + else ret = -ENOSYS; - break; - } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; - cma_dbg(id_priv, "sending REP\n"); + ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; - if (!conn_param) - return -EINVAL; - ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) { iw_param.qpn = id_priv->qp_num; } else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, - enum ib_cm_sidr_status status, + enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, qkey); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; } rep.private_data = private_data; rep.private_data_len = private_data_len; - cma_dbg(id_priv, "sending SIDR\n"); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); - id_priv->owner = curthread->td_proc->p_pid; + id_priv->owner = task_pid_nr(current); + if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - 
conn_param->private_data, - conn_param->private_data_len); + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->qkey, + conn_param->private_data, + conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - NULL, 0); + 0, NULL, 0); } else { if (conn_param) - ret = cma_accept_ib(id_priv, conn_param); - else - ret = cma_rep_recv(id_priv); + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_accept_iw(id_priv, conn_param); - break; - default: + else ret = -ENOSYS; - break; - } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); - else { - cma_dbg(id_priv, "sending REJ\n"); + else ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); - } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if 
(rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); - break; - default: + } else ret = -ENOSYS; - break; - } + return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ - cma_dbg(id_priv, "sending DREQ\n"); - if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { - cma_dbg(id_priv, "sending DREP\n"); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); - } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); - break; - default: + } else ret = -EINVAL; - break; - } + out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; - struct rdma_dev_addr *dev_addr; - int ret; - struct net_device *ndev = NULL; - u16 vlan; + int ret = 0; id_priv = mc->id_priv; - dev_addr = &id_priv->id.route.addr.dev_addr; - if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && - cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_ADDR_BOUND && + id_priv->state != RDMA_CM_ADDR_RESOLVED) + goto out; + if (!status) + status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, 
be16_to_cpu(multicast->rec.mlid)); mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (!ndev) { - status = -ENODEV; - } else { - vlan = rdma_vlan_dev_vlan_id(ndev); - dev_put(ndev); - } if (!status) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = + dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + enum ib_gid_type gid_type = + id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, + ndev, gid_type, &event.param.ud.ah_attr); - event.param.ud.ah_attr.vlan_id = vlan; event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - } else { + if (ndev) + dev_put(ndev); + } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; - /* mark that the cached record is no longer valid */ - if (status != -ENETRESET && status != -EAGAIN) { - spin_lock(&id_priv->lock); - id_priv->is_valid_rec = 0; - spin_unlock(&id_priv->lock); - } - } - ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; } +out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; -#if defined(INET6) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; -#endif if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); -#if defined(INET6) } else if ((addr->sa_family == AF_INET6) && 
((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if (addr->sa_family == AF_IB) { + memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); -#endif } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } +static void cma_query_sa_classport_info_cb(int status, + struct ib_class_port_info *rec, + void *context) +{ + struct class_port_info_context *cb_ctx = context; + + WARN_ON(!context); + + if (status || !rec) { + pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n", + cb_ctx->device->name, cb_ctx->port_num, status); + goto out; + } + + memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info)); + +out: + complete(&cb_ctx->done); +} + +static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num, + struct ib_class_port_info *class_port_info) +{ + struct class_port_info_context *cb_ctx; + int ret; + + cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL); + if (!cb_ctx) + return -ENOMEM; + + cb_ctx->device = device; + cb_ctx->class_port_info = class_port_info; + cb_ctx->port_num = port_num; + init_completion(&cb_ctx->done); + + ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num, + CMA_QUERY_CLASSPORT_INFO_TIMEOUT, + GFP_KERNEL, cma_query_sa_classport_info_cb, + cb_ctx, &cb_ctx->sa_query); + if (ret < 0) { + pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n", + device->name, port_num, ret); + goto out; + } + + wait_for_completion(&cb_ctx->done); + +out: + kfree(cb_ctx); + return ret; +} + 
static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; + struct ib_class_port_info class_port_info; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; - int ret = 0; + int ret; - ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid); + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, + &rec.mgid, &rec); + if (ret) + return ret; - /* cache ipoib bc record */ - spin_lock(&id_priv->lock); - if (!id_priv->is_valid_rec) - ret = ib_sa_get_mcmember_rec(id_priv->id.device, - id_priv->id.port_num, - &id_priv->rec.mgid, - &id_priv->rec); - if (ret) { - id_priv->is_valid_rec = 0; - spin_unlock(&id_priv->lock); + ret = cma_set_qkey(id_priv, 0); + if (ret) return ret; - } else { - rec = id_priv->rec; - id_priv->is_valid_rec = 1; - } - spin_unlock(&id_priv->lock); cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); - if (id_priv->id.ps == RDMA_PS_UDP) - rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); - rec.join_state = 1; + rec.join_state = mc->join_state; + if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) { + ret = cma_query_sa_classport_info(id_priv->id.device, + id_priv->id.port_num, + &class_port_info); + + if (ret) + return ret; + + if (!(ib_get_cpi_capmask2(&class_port_info) & + IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) { + pr_warn("RDMA CM: %s port %u Unable to multicast join\n" + "RDMA CM: SM doesn't support Send Only Full Member option\n", + id_priv->id.device->name, id_priv->id.port_num); + return -EOPNOTSUPP; + } + } + comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; 
if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); - return PTR_RET(mc->multicast.ib); + return PTR_ERR_OR_ZERO(mc->multicast.ib); } static void iboe_mcast_work_handler(struct work_struct *work) { struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); struct cma_multicast *mc = mw->mc; struct ib_sa_multicast *m = mc->multicast.ib; mc->multicast.ib->context = mc; cma_ib_mc_handler(0, m); kref_put(&mc->mcref, release_mc); kfree(mw); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = 0xff; mgid->raw[1] = 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - int err; + int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + enum ib_gid_type gid_type; + bool send_only; + send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); + if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), 
GFP_KERNEL); if (!mc->multicast.ib) { err = -ENOMEM; goto out1; } cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; } mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + if (!send_only) { + mc->igmp_joined = true; + } + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } dev_put(ndev); - if (!mc->multicast.ib->rec.mtu) { - err = -EINVAL; + if (err || !mc->multicast.ib->rec.mtu) { + if (!err) + err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); kref_get(&mc->mcref); queue_work(cma_wq, &work->work); return 0; out2: kfree(mc->multicast.ib); out1: kfree(work); return err; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context) + u8 join_state, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) return -EINVAL; mc = kmalloc(sizeof *mc, GFP_KERNEL); if (!mc) return -ENOMEM; - memcpy(&mc->addr, addr, ip_addr_size(addr)); + memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; 
mc->id_priv = id_priv; - + mc->igmp_joined = false; + mc->join_state = join_state; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ret = cma_join_ib_multicast(id_priv, mc); - break; - case IB_LINK_LAYER_ETHERNET: - kref_init(&mc->mcref); - ret = cma_iboe_join_multicast(id_priv, mc); - break; - default: - ret = -EINVAL; - } - break; - default: + if (rdma_protocol_roce(id->device, id->port_num)) { + kref_init(&mc->mcref); + ret = cma_iboe_join_multicast(id_priv, mc); + } else if (rdma_cap_ib_mcast(id->device, id->port_num)) + ret = cma_join_ib_multicast(id_priv, mc); + else ret = -ENOSYS; - break; - } if (ret) { spin_lock_irq(&id_priv->lock); list_del(&mc->list); spin_unlock_irq(&id_priv->lock); kfree(mc); } return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { + if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); if (id->qp) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, be16_to_cpu(mc->multicast.ib->rec.mlid)); - if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - break; - case IB_LINK_LAYER_ETHERNET: - kref_put(&mc->mcref, release_mc); - break; - default: - break; + + BUG_ON(id_priv->cma_dev->device != id->device); + + if (rdma_cap_ib_mcast(id->device, id->port_num)) { + 
ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + } else if (rdma_protocol_roce(id->device, id->port_num)) { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev) { + dev_put(ndev); + } + mc->igmp_joined = false; } + kref_put(&mc->mcref, release_mc); } return; } } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); -static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +static int +sysctl_cma_default_roce_mode(SYSCTL_HANDLER_ARGS) { - struct rdma_dev_addr *dev_addr; - struct cma_ndev_work *work; + struct cma_device *cma_dev = arg1; + const int port = arg2; + char buf[64]; + int error; - dev_addr = &id_priv->id.route.addr.dev_addr; + strlcpy(buf, ib_cache_gid_type_str( + cma_get_default_gid_type(cma_dev, port)), sizeof(buf)); - if ((dev_addr->bound_dev_if == ndev->if_index) && - memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { - printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", - ndev->if_xname, &id_priv->id); - work = kzalloc(sizeof *work, GFP_KERNEL); - if (!work) - return -ENOMEM; + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + goto done; - INIT_WORK(&work->work, cma_ndev_work_handler); - work->id = id_priv; - work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; - atomic_inc(&id_priv->refcount); - queue_work(cma_wq, &work->work); + error = ib_cache_gid_parse_type_str(buf); + if (error < 0) { + error = EINVAL; + goto done; } - return 0; + cma_set_default_gid_type(cma_dev, port, error); + error = 0; +done: + return (error); } -static int cma_netdev_callback(struct notifier_block *self, unsigned long event, - void *ctx) -{ - struct net_device *ndev = (struct net_device *)ctx; - struct cma_device *cma_dev; - struct rdma_id_private *id_priv; - int 
ret = NOTIFY_DONE; - -/* BONDING related, commented out until the bonding is resolved */ -#if 0 - if (dev_net(ndev) != &init_net) - return NOTIFY_DONE; - - if (event != NETDEV_BONDING_FAILOVER) - return NOTIFY_DONE; - - if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) - return NOTIFY_DONE; -#endif - if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) - return NOTIFY_DONE; - - mutex_lock(&lock); - list_for_each_entry(cma_dev, &dev_list, list) - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - ret = cma_netdev_change(ndev, id_priv); - if (ret) - goto out; - } - -out: - mutex_unlock(&lock); - return ret; -} - -static struct notifier_block cma_nb = { - .notifier_call = cma_netdev_callback -}; - static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; + unsigned int i; + unsigned long supported_gids = 0; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; + sysctl_ctx_init(&cma_dev->sysctl_ctx); + cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) { + kfree(cma_dev); + return; + } + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + } init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); + + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + char buf[64]; + + snprintf(buf, sizeof(buf), "default_roce_mode_port%d", i); + + (void) 
SYSCTL_ADD_PROC(&cma_dev->sysctl_ctx, + SYSCTL_CHILDREN(device->ports_parent->parent->oidp), + OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + cma_dev, i, &sysctl_cma_default_roce_mode, "A", + "Default RoCE mode. Valid values: IB/RoCE v1 and RoCE v2"); + } } static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; enum rdma_cm_state state; int ret = 0; /* Record that we want to remove the device */ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); if (state == RDMA_CM_DESTROYING) return 0; cma_cancel_operation(id_priv, state); mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. */ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) goto out; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; ret = id_priv->id.event_handler(&id_priv->id, &event); out: mutex_unlock(&id_priv->handler_mutex); return ret; } static void cma_process_remove(struct cma_device *cma_dev) { struct rdma_id_private *id_priv; int ret; mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { id_priv = list_entry(cma_dev->id_list.next, struct rdma_id_private, list); list_del(&id_priv->listen_list); list_del_init(&id_priv->list); atomic_inc(&id_priv->refcount); mutex_unlock(&lock); ret = id_priv->internal_id ? 
1 : cma_remove_id_dev(id_priv); cma_deref_id(id_priv); if (ret) rdma_destroy_id(&id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); cma_deref_dev(cma_dev); wait_for_completion(&cma_dev->comp); } -static void cma_remove_one(struct ib_device *device) +static void cma_remove_one(struct ib_device *device, void *client_data) { - struct cma_device *cma_dev; + struct cma_device *cma_dev = client_data; - cma_dev = ib_get_client_data(device, &cma_client); if (!cma_dev) return; mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); + sysctl_ctx_free(&cma_dev->sysctl_ctx); + kfree(cma_dev->default_gid_type); kfree(cma_dev); } +static void cma_init_vnet(void *arg) +{ + struct cma_pernet *pernet = &VNET(cma_pernet); + + idr_init(&pernet->tcp_ps); + idr_init(&pernet->udp_ps); + idr_init(&pernet->ipoib_ps); + idr_init(&pernet->ib_ps); +} +VNET_SYSINIT(cma_init_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_FIRST, cma_init_vnet, NULL); + +static void cma_destroy_vnet(void *arg) +{ + struct cma_pernet *pernet = &VNET(cma_pernet); + + idr_destroy(&pernet->tcp_ps); + idr_destroy(&pernet->udp_ps); + idr_destroy(&pernet->ipoib_ps); + idr_destroy(&pernet->ib_ps); +} +VNET_SYSUNINIT(cma_destroy_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_SECOND, cma_destroy_vnet, NULL); + static int __init cma_init(void) { - int ret = -ENOMEM; + int ret; - cma_wq = create_singlethread_workqueue("rdma_cm"); + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; - cma_free_wq = create_singlethread_workqueue("rdma_cm_fr"); - if (!cma_free_wq) - goto err1; - ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); - register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); if (ret) goto err; + cma_configfs_init(); + return 0; err: - unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); - - destroy_workqueue(cma_free_wq); -err1: 
destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { + cma_configfs_exit(); ib_unregister_client(&cma_client); - unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); - flush_workqueue(cma_free_wq); - destroy_workqueue(cma_free_wq); destroy_workqueue(cma_wq); - idr_destroy(&sdp_ps); - idr_destroy(&tcp_ps); - idr_destroy(&udp_ps); - idr_destroy(&ipoib_ps); - idr_destroy(&ib_ps); } module_init(cma_init); module_exit(cma_cleanup); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/core_priv.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/core_priv.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/core_priv.h (revision 319974) @@ -1,52 +1,148 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _CORE_PRIV_H #define _CORE_PRIV_H #include #include #include +#include + +#ifdef CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif +struct cma_device; +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port); +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type); +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, - u8, struct kobject *)); + u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); -int ib_sysfs_setup(void); -void ib_sysfs_cleanup(void); - -int ib_cache_setup(void); +void ib_cache_setup(void); void ib_cache_cleanup(void); + +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask); + +typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); +void ib_enum_all_roce_netdevs(roce_netdev_filter 
filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); + +enum ib_cache_gid_default_mode { + IB_CACHE_GID_DEFAULT_MODE_SET, + IB_CACHE_GID_DEFAULT_MODE_DELETE +}; + +int ib_cache_gid_parse_type_str(const char *buf); + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode); + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev); + +int roce_gid_mgmt_init(void); +void roce_gid_mgmt_cleanup(void); + +int roce_rescan_device(struct ib_device *ib_dev); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); + +int ib_cache_setup_one(struct ib_device *device); +void ib_cache_cleanup_one(struct ib_device *device); +void ib_cache_release_one(struct ib_device *device); + +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + + /* TODO: add support for LAGG */ + upper = VLAN_TRUNKDEV(upper); + + return (dev == upper); +} + +int addr_init(void); +void addr_cleanup(void); + +int ib_mad_init(void); +void ib_mad_cleanup(void); + +int ib_sa_init(void); +void ib_sa_cleanup(void); #endif /* _CORE_PRIV_H */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/device.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/device.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/device.c (revision 319974) @@ -1,793 +1,1025 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include +#include +#include +#include #include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct ib_client_data { struct list_head list; struct ib_client *client; void * data; + /* The device or client is going down. Do not call client or device + * callbacks other than remove(). */ + bool going_down; }; +struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +/* The device_list and client_list contain devices and clients after their + * registration has completed, and the devices and clients are removed + * during unregistration. 
*/ static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* - * device_mutex protects access to both device_list and client_list. - * There's no real point to using multiple locks or something fancier - * like an rwsem: we always access both lists, and we're always - * modifying one list or the other list. In any case this is not a - * hot path so there's no point in trying to optimize. + * device_mutex and lists_rwsem protect access to both device_list and + * client_list. device_mutex protects writer access by device and client + * registration / de-registration. lists_rwsem protects reader access to + * these lists. Iterators of these lists must lock it for read, while updates + * to the lists must be done with a write lock. A special case is when the + * device_mutex is locked. In this case locking the lists for read access is + * not necessary as the device_mutex implies it. + * + * lists_rwsem also protects access to the client data list. */ static DEFINE_MUTEX(device_mutex); +static DECLARE_RWSEM(lists_rwsem); + static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(query_gid), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_ah), IB_MANDATORY_FUNC(destroy_ah), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), - IB_MANDATORY_FUNC(dereg_mr) + IB_MANDATORY_FUNC(dereg_mr), + IB_MANDATORY_FUNC(get_port_immutable) }; int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { - if (!*(void **) ((void *) device + 
mandatory_table[i].offset)) { - printk(KERN_WARNING "Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + if (!*(void **) ((char *) device + mandatory_table[i].offset)) { + pr_warn("Device %s is missing mandatory function %s\n", + device->name, mandatory_table[i].name); return -EINVAL; } } return 0; } static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; list_for_each_entry(device, &device_list, core_list) if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) return device; return NULL; } static int alloc_name(char *name) { unsigned long *inuse; char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); if (!inuse) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { if (!sscanf(device->name, name, &i)) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); snprintf(buf, sizeof buf, name, i); if (__ib_device_get_by_name(buf)) return -ENFILE; strlcpy(name, buf, IB_DEVICE_NAME_MAX); return 0; } -static int start_port(struct ib_device *device) +static void ib_device_release(struct device *device) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; -} + struct ib_device *dev = container_of(device, struct ib_device, dev); - -static int end_port(struct ib_device *device) -{ - return (device->node_type == RDMA_NODE_IB_SWITCH) ? - 0 : device->phys_port_cnt; + ib_cache_release_one(dev); + kfree(dev->port_immutable); + kfree(dev); } +static struct class ib_class = { + .name = "infiniband", + .dev_release = ib_device_release, +}; + /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. 
@size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *ib_alloc_device(size_t size) { - struct ib_device *dev; + struct ib_device *device; - BUG_ON(size < sizeof (struct ib_device)); + if (WARN_ON(size < sizeof(struct ib_device))) + return NULL; - dev = kzalloc(size, GFP_KERNEL); - spin_lock_init(&dev->cmd_perf_lock); + device = kzalloc(size, GFP_KERNEL); + if (!device) + return NULL; - return dev; + device->dev.parent = &linux_root_device; + device->dev.class = &ib_class; + device_initialize(&device->dev); + + dev_set_drvdata(&device->dev, device); + + INIT_LIST_HEAD(&device->event_handler_list); + spin_lock_init(&device->event_handler_lock); + spin_lock_init(&device->client_data_lock); + INIT_LIST_HEAD(&device->client_data_list); + INIT_LIST_HEAD(&device->port_list); + + return device; } EXPORT_SYMBOL(ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). 
*/ void ib_dealloc_device(struct ib_device *device) { - if (device->reg_state == IB_DEV_UNINITIALIZED) { - kfree(device); - return; - } - - BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); - + WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && + device->reg_state != IB_DEV_UNINITIALIZED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; unsigned long flags; context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { - printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", - device->name, client->name); + pr_warn("Couldn't allocate client context for %s/%s\n", + device->name, client->name); return -ENOMEM; } context->client = client; context->data = NULL; + context->going_down = false; + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); return 0; } -static int read_port_table_lengths(struct ib_device *device) +static int verify_immutable(const struct ib_device *dev, u8 port) { - struct ib_port_attr *tprops = NULL; - int num_ports, ret = -ENOMEM; - u8 port_index; + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} - tprops = kmalloc(sizeof *tprops, GFP_KERNEL); - if (!tprops) - goto out; +static int read_port_immutable(struct ib_device *device) +{ + int ret; + u8 start_port = rdma_start_port(device); + u8 end_port = rdma_end_port(device); + u8 port; - num_ports = end_port(device) - start_port(device) + 1; + /** + * device->port_immutable is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_immutable is declared as a 1 based array with + * potential empty slots at the beginning. 
+ */ + device->port_immutable = kzalloc(sizeof(*device->port_immutable) + * (end_port + 1), + GFP_KERNEL); + if (!device->port_immutable) + return -ENOMEM; - device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, - GFP_KERNEL); - device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, - GFP_KERNEL); - if (!device->pkey_tbl_len || !device->gid_tbl_len) - goto err; - - for (port_index = 0; port_index < num_ports; ++port_index) { - ret = ib_query_port(device, port_index + start_port(device), - tprops); + for (port = start_port; port <= end_port; ++port) { + ret = device->get_port_immutable(device, port, + &device->port_immutable[port]); if (ret) - goto err; - device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; - device->gid_tbl_len[port_index] = tprops->gid_tbl_len; + return ret; + + if (verify_immutable(device, port)) + return -EINVAL; } + return 0; +} - ret = 0; - goto out; - -err: - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); -out: - kfree(tprops); - return ret; +void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +{ + if (dev->get_dev_fw_str) + dev->get_dev_fw_str(dev, str, str_len); + else + str[0] = '\0'; } +EXPORT_SYMBOL(ib_get_device_fw_str); /** * ib_register_device - Register an IB device with IB core * @device:Device to register * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). 
*/ int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) { int ret; + struct ib_client *client; + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; mutex_lock(&device_mutex); if (strchr(device->name, '%')) { ret = alloc_name(device->name); if (ret) goto out; } if (ib_device_check_mandatory(device)) { ret = -EINVAL; goto out; } - INIT_LIST_HEAD(&device->event_handler_list); - INIT_LIST_HEAD(&device->client_data_list); - spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); + ret = read_port_immutable(device); + if (ret) { + pr_warn("Couldn't create per port immutable data %s\n", + device->name); + goto out; + } - ret = read_port_table_lengths(device); + ret = ib_cache_setup_one(device); if (ret) { - printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", - device->name); + pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); goto out; } + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + pr_warn("Couldn't query the device attributes\n"); + ib_cache_cleanup_one(device); + goto out; + } + ret = ib_device_register_sysfs(device, port_callback); if (ret) { - printk(KERN_WARNING "Couldn't register device %s with driver model\n", - device->name); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); + pr_warn("Couldn't register device %s with driver model\n", + device->name); + ib_cache_cleanup_one(device); goto out; } - list_add_tail(&device->core_list, &device_list); - device->reg_state = IB_DEV_REGISTERED; - { - struct ib_client *client; + list_for_each_entry(client, &client_list, list) + if (client->add && !add_client_context(device, client)) + client->add(device); - list_for_each_entry(client, &client_list, list) - if (client->add && !add_client_context(device, client)) - client->add(device); - } - - out: + down_write(&lists_rwsem); + list_add_tail(&device->core_list, 
&device_list); + up_write(&lists_rwsem); +out: mutex_unlock(&device_mutex); return ret; } EXPORT_SYMBOL(ib_register_device); /** * ib_unregister_device - Unregister an IB device * @device:Device to unregister * * Unregister an IB device. All clients will receive a remove callback. */ void ib_unregister_device(struct ib_device *device) { - struct ib_client *client; struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); - list_for_each_entry_reverse(client, &client_list, list) - if (client->remove) - client->remove(device); - + down_write(&lists_rwsem); list_del(&device->core_list); + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + context->going_down = true; + spin_unlock_irqrestore(&device->client_data_lock, flags); + downgrade_write(&lists_rwsem); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + if (context->client->remove) + context->client->remove(device, context->data); + } + up_read(&lists_rwsem); mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); + ib_cache_cleanup_one(device); + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. 
When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; mutex_lock(&device_mutex); - list_add_tail(&client->list, &client_list); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); + down_write(&lists_rwsem); + list_add_tail(&client->list, &client_list); + up_write(&lists_rwsem); + mutex_unlock(&device_mutex); return 0; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. */ void ib_unregister_client(struct ib_client *client) { struct ib_client_data *context, *tmp; struct ib_device *device; unsigned long flags; mutex_lock(&device_mutex); + down_write(&lists_rwsem); + list_del(&client->list); + up_write(&lists_rwsem); + list_for_each_entry(device, &device_list, core_list) { - if (client->remove) - client->remove(device); + struct ib_client_data *found_context = NULL; + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { - list_del(&context->list); - kfree(context); + context->going_down = true; + found_context = context; + break; } spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); + + if (client->remove) + client->remove(device, found_context ? 
+ found_context->data : NULL); + + if (!found_context) { + pr_warn("No client context found for %s/%s\n", + device->name, client->name); + continue; + } + + down_write(&lists_rwsem); + spin_lock_irqsave(&device->client_data_lock, flags); + list_del(&found_context->list); + kfree(found_context); + spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); } - list_del(&client->list); mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns client context set with * ib_set_client_data(). */ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; void *ret = NULL; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } spin_unlock_irqrestore(&device->client_data_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_client_data); /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context that can be retrieved with * ib_get_client_data(). 
*/ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { struct ib_client_data *context; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } - printk(KERN_WARNING "No client context found for %s/%s\n", - device->name, client->name); + pr_warn("No client context found for %s/%s\n", + device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ int ib_register_event_handler (struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). 
*/ int ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. */ void ib_dispatch_event(struct ib_event *event) { unsigned long flags; struct ib_event_handler *handler; spin_lock_irqsave(&event->device->event_handler_lock, flags); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); spin_unlock_irqrestore(&event->device->event_handler_lock, flags); } EXPORT_SYMBOL(ib_dispatch_event); /** - * ib_query_device - Query IB device attributes - * @device:Device to query - * @device_attr:Device attributes - * - * ib_query_device() returns the attributes of a device through the - * @device_attr pointer. - */ -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr) -{ - return device->query_device(device, device_attr); -} -EXPORT_SYMBOL(ib_query_device); - -/** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. 
*/ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { - if (port_num < start_port(device) || port_num > end_port(device)) + union ib_gid gid; + int err; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; - return device->query_port(device, port_num, port_attr); + memset(port_attr, 0, sizeof(*port_attr)); + err = device->query_port(device, port_num, port_attr); + if (err || port_attr->subnet_prefix) + return err; + + if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) + return 0; + + err = ib_query_gid(device, port_num, 0, &gid, NULL); + if (err) + return err; + + port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); + return 0; } EXPORT_SYMBOL(ib_query_port); /** * ib_query_gid - Get GID table entry * @device:Device to query * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID + * @attr: Returned GID attributes related to this GID index (only in RoCE). + * NULL means ignore. * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid) + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr) { + if (rdma_cap_roce_gid_table(device, port_num)) + return ib_get_cached_gid(device, port_num, index, gid, attr); + + if (attr) + return -EINVAL; + return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** + * ib_enum_roce_netdev - enumerate all RoCE ports + * @ib_dev : IB device we want to query + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all of the physical RoCE ports of ib_dev + * which are related to netdevice and calls callback() on each + * device for which filter() function returns non zero. 
+ */ +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + u8 port; + + for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); + port++) + if (rdma_protocol_roce(ib_dev, port)) { + struct net_device *idev = NULL; + + if (ib_dev->get_netdev) + idev = ib_dev->get_netdev(ib_dev, port); + + if (idev && (idev->if_flags & IFF_DYING)) { + dev_put(idev); + idev = NULL; + } + + if (filter(ib_dev, port, idev, filter_cookie)) + cb(ib_dev, port, idev, cookie); + + if (idev) + dev_put(idev); + } +} + +/** + * ib_enum_all_roce_netdevs - enumerate all RoCE devices + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all RoCE devices' physical ports which are related + * to netdevices and calls callback() on each device for which + * filter() function returns non zero. + */ +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + struct ib_device *dev; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) + ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); + up_read(&lists_rwsem); +} + +/** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. 
*/ int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey) { return device->query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->modify_device) return -ENOSYS; return device->modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { if (!device->modify_port) return -ENOSYS; - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, port_modify); } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: Type of GID. + * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. 
*/ int ib_find_gid(struct ib_device *device, union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; - for (port = start_port(device); port <= end_port(device); ++port) { - for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { - ret = ib_query_gid(device, port, i, &tmp_gid); + for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + if (rdma_cap_roce_gid_table(device, port)) { + if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, index)) { + *port_num = port; + return 0; + } + } + + if (gid_type != IB_GID_TYPE_IB) + continue; + + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. 
*/ int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; - for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { + for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); +/** + * ib_get_net_dev_by_params() - Return the appropriate net_dev + * for a received CM request + * @dev: An RDMA device on which the request has been received. + * @port: Port number on the RDMA device. + * @pkey: The Pkey the request came on. + * @gid: A GID that the net_dev uses to communicate. + * @addr: Contains the IP address that the request specified as its + * destination. 
+ */ +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, + u8 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr) +{ + struct net_device *net_dev = NULL; + struct ib_client_data *context; + + if (!rdma_protocol_ib(dev, port)) + return NULL; + + down_read(&lists_rwsem); + + list_for_each_entry(context, &dev->client_data_list, list) { + struct ib_client *client = context->client; + + if (context->going_down) + continue; + + if (client->get_net_dev_by_params) { + net_dev = client->get_net_dev_by_params(dev, port, pkey, + gid, addr, + context->data); + if (net_dev) + break; + } + } + + up_read(&lists_rwsem); + + return net_dev; +} +EXPORT_SYMBOL(ib_get_net_dev_by_params); + static int __init ib_core_init(void) { int ret; - ib_wq = create_workqueue("infiniband"); + ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; - ret = ib_sysfs_setup(); - if (ret) { - printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, + mp_ncpus * 4 /* WQ_UNBOUND_MAX_ACTIVE */); + if (!ib_comp_wq) { + ret = -ENOMEM; goto err; } - ret = ib_cache_setup(); + ret = class_register(&ib_class); if (ret) { - printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); + pr_warn("Couldn't create InfiniBand device class\n"); + goto err_comp; + } + + ret = addr_init(); + if (ret) { + pr_warn("Could't init IB address resolution\n"); goto err_sysfs; } + ret = ib_mad_init(); + if (ret) { + pr_warn("Couldn't init IB MAD\n"); + goto err_addr; + } + + ret = ib_sa_init(); + if (ret) { + pr_warn("Couldn't init SA\n"); + goto err_mad; + } + + ib_cache_setup(); + return 0; +err_mad: + ib_mad_cleanup(); +err_addr: + addr_cleanup(); err_sysfs: - ib_sysfs_cleanup(); - + class_unregister(&ib_class); +err_comp: + destroy_workqueue(ib_comp_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); - 
ib_sysfs_cleanup(); + ib_sa_cleanup(); + ib_mad_cleanup(); + addr_cleanup(); + class_unregister(&ib_class); + destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } module_init(ib_core_init); module_exit(ib_core_cleanup); -static int -ibcore_evhand(module_t mod, int event, void *arg) -{ - return (0); -} - -static moduledata_t ibcore_mod = { - .name = "ibcore", - .evhand = ibcore_evhand, -}; - MODULE_VERSION(ibcore, 1); MODULE_DEPEND(ibcore, linuxkpi, 1, 1, 1); -DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_LAST, SI_ORDER_ANY); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/fmr_pool.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/fmr_pool.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/fmr_pool.c (revision 319974) @@ -1,544 +1,520 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include -#include #include #include #include #include #include "core_priv.h" #define PFX "fmr_pool: " enum { IB_FMR_MAX_REMAPS = 32, IB_FMR_HASH_BITS = 8, IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS, IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1 }; /* * If an FMR is not in use, then the list member will point to either * its pool's free_list (if the FMR can be mapped again; that is, * remap_count < pool->max_remaps) or its pool's dirty_list (if the * FMR needs to be unmapped before being remapped). In either of * these cases it is a bug if the ref_count is not 0. In other words, * if ref_count is > 0, then the list member must not be linked into * either free_list or dirty_list. * * The cache_node member is used to link the FMR into a cache bucket * (if caching is enabled). This is independent of the reference * count of the FMR. When a valid FMR is released, its ref_count is * decremented, and if ref_count reaches 0, the FMR is placed in * either free_list or dirty_list as appropriate. However, it is not * removed from the cache and may be "revived" if a call to * ib_fmr_register_physical() occurs before the FMR is remapped. In * this case we just increment the ref_count and remove the FMR from * free_list/dirty_list. * * Before we remap an FMR from free_list, we remove it from the cache * (to prevent another user from obtaining a stale FMR). When an FMR * is released, we add it to the tail of the free list, so that our * cache eviction policy is "least recently used." 
* * All manipulation of ref_count, list and cache_node is protected by * pool_lock to maintain consistency. */ struct ib_fmr_pool { spinlock_t pool_lock; int pool_size; int max_pages; int max_remaps; int dirty_watermark; int dirty_len; struct list_head free_list; struct list_head dirty_list; struct hlist_head *cache_bucket; void (*flush_function)(struct ib_fmr_pool *pool, void * arg); void *flush_arg; struct task_struct *thread; atomic_t req_ser; atomic_t flush_ser; wait_queue_head_t force_wait; }; static inline u32 ib_fmr_hash(u64 first_page) { return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) & (IB_FMR_HASH_SIZE - 1); } /* Caller must hold pool_lock */ static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, u64 *page_list, int page_list_len, u64 io_virtual_address) { struct hlist_head *bucket; struct ib_pool_fmr *fmr; if (!pool->cache_bucket) return NULL; bucket = pool->cache_bucket + ib_fmr_hash(*page_list); hlist_for_each_entry(fmr, bucket, cache_node) if (io_virtual_address == fmr->io_virtual_address && page_list_len == fmr->page_list_len && !memcmp(page_list, fmr->page_list, page_list_len * sizeof *page_list)) return fmr; return NULL; } static void ib_fmr_batch_release(struct ib_fmr_pool *pool) { int ret; struct ib_pool_fmr *fmr; LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); spin_lock_irq(&pool->pool_lock); list_for_each_entry(fmr, &pool->dirty_list, list) { hlist_del_init(&fmr->cache_node); fmr->remap_count = 0; list_add_tail(&fmr->fmr->list, &fmr_list); #ifdef DEBUG if (fmr->ref_count !=0) { - printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n", - fmr, fmr->ref_count); + pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n", + fmr, fmr->ref_count); } #endif } list_splice_init(&pool->dirty_list, &unmap_list); pool->dirty_len = 0; spin_unlock_irq(&pool->pool_lock); if (list_empty(&unmap_list)) { return; } ret = ib_unmap_fmr(&fmr_list); if (ret) - printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret); + 
pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); spin_lock_irq(&pool->pool_lock); list_splice(&unmap_list, &pool->free_list); spin_unlock_irq(&pool->pool_lock); } static int ib_fmr_cleanup_thread(void *pool_ptr) { struct ib_fmr_pool *pool = pool_ptr; do { if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) { ib_fmr_batch_release(pool); atomic_inc(&pool->flush_ser); wake_up_interruptible(&pool->force_wait); if (pool->flush_function) pool->flush_function(pool, pool->flush_arg); } set_current_state(TASK_INTERRUPTIBLE); if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 && !kthread_should_stop()) schedule(); __set_current_state(TASK_RUNNING); } while (!kthread_should_stop()); return 0; } /** * ib_create_fmr_pool - Create an FMR pool * @pd:Protection domain for FMRs * @params:FMR pool parameters * * Create a pool of FMRs. Return value is pointer to new pool or * error code if creation failed. */ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, struct ib_fmr_pool_param *params) { struct ib_device *device; struct ib_fmr_pool *pool; - struct ib_device_attr *attr; int i; int ret; int max_remaps; if (!params) return ERR_PTR(-EINVAL); device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - printk(KERN_INFO PFX "Device %s does not support FMRs\n", - device->name); + pr_info(PFX "Device %s does not support FMRs\n", device->name); return ERR_PTR(-ENOSYS); } - attr = kmalloc(sizeof *attr, GFP_KERNEL); - if (!attr) { - printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); - return ERR_PTR(-ENOMEM); - } - - ret = ib_query_device(device, attr); - if (ret) { - printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); - kfree(attr); - return ERR_PTR(ret); - } - - if (!attr->max_map_per_fmr) + if (!device->attrs.max_map_per_fmr) max_remaps = IB_FMR_MAX_REMAPS; else - max_remaps = attr->max_map_per_fmr; + max_remaps = device->attrs.max_map_per_fmr; - kfree(attr); - 
pool = kmalloc(sizeof *pool, GFP_KERNEL); - if (!pool) { - printk(KERN_WARNING PFX "couldn't allocate pool struct\n"); + if (!pool) return ERR_PTR(-ENOMEM); - } pool->cache_bucket = NULL; - pool->flush_function = params->flush_function; pool->flush_arg = params->flush_arg; INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->dirty_list); if (params->cache) { pool->cache_bucket = kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket, GFP_KERNEL); if (!pool->cache_bucket) { - printk(KERN_WARNING PFX "Failed to allocate cache in pool\n"); + pr_warn(PFX "Failed to allocate cache in pool\n"); ret = -ENOMEM; goto out_free_pool; } for (i = 0; i < IB_FMR_HASH_SIZE; ++i) INIT_HLIST_HEAD(pool->cache_bucket + i); } pool->pool_size = 0; pool->max_pages = params->max_pages_per_fmr; pool->max_remaps = max_remaps; pool->dirty_watermark = params->dirty_watermark; pool->dirty_len = 0; spin_lock_init(&pool->pool_lock); atomic_set(&pool->req_ser, 0); atomic_set(&pool->flush_ser, 0); init_waitqueue_head(&pool->force_wait); pool->thread = kthread_run(ib_fmr_cleanup_thread, pool, "ib_fmr(%s)", device->name); if (IS_ERR(pool->thread)) { - printk(KERN_WARNING PFX "couldn't start cleanup thread\n"); + pr_warn(PFX "couldn't start cleanup thread\n"); ret = PTR_ERR(pool->thread); goto out_free_pool; } { struct ib_pool_fmr *fmr; struct ib_fmr_attr fmr_attr = { .max_pages = params->max_pages_per_fmr, .max_maps = pool->max_remaps, .page_shift = params->page_shift }; int bytes_per_fmr = sizeof *fmr; if (pool->cache_bucket) bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); for (i = 0; i < params->pool_size; ++i) { fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); - if (!fmr) { - printk(KERN_WARNING PFX "failed to allocate fmr " - "struct for FMR %d\n", i); + if (!fmr) goto out_fail; - } fmr->pool = pool; fmr->remap_count = 0; fmr->ref_count = 0; INIT_HLIST_NODE(&fmr->cache_node); fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); if (IS_ERR(fmr->fmr)) { - printk(KERN_WARNING PFX 
"fmr_create failed " - "for FMR %d\n", i); + pr_warn(PFX "fmr_create failed for FMR %d\n", + i); kfree(fmr); goto out_fail; } list_add_tail(&fmr->list, &pool->free_list); ++pool->pool_size; } } return pool; out_free_pool: kfree(pool->cache_bucket); kfree(pool); return ERR_PTR(ret); out_fail: ib_destroy_fmr_pool(pool); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(ib_create_fmr_pool); /** * ib_destroy_fmr_pool - Free FMR pool * @pool:FMR pool to free * * Destroy an FMR pool and free all associated resources. */ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) { struct ib_pool_fmr *fmr; struct ib_pool_fmr *tmp; LIST_HEAD(fmr_list); int i; kthread_stop(pool->thread); ib_fmr_batch_release(pool); i = 0; list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) { if (fmr->remap_count) { INIT_LIST_HEAD(&fmr_list); list_add_tail(&fmr->fmr->list, &fmr_list); ib_unmap_fmr(&fmr_list); } ib_dealloc_fmr(fmr->fmr); list_del(&fmr->list); kfree(fmr); ++i; } if (i < pool->pool_size) - printk(KERN_WARNING PFX "pool still has %d regions registered\n", - pool->pool_size - i); + pr_warn(PFX "pool still has %d regions registered\n", + pool->pool_size - i); kfree(pool->cache_bucket); kfree(pool); } EXPORT_SYMBOL(ib_destroy_fmr_pool); /** * ib_flush_fmr_pool - Invalidate all unmapped FMRs * @pool:FMR pool to flush * * Ensure that all unmapped FMRs are fully invalidated. */ int ib_flush_fmr_pool(struct ib_fmr_pool *pool) { int serial; struct ib_pool_fmr *fmr, *next; /* * The free_list holds FMRs that may have been used * but have not been remapped enough times to be dirty. * Put them on the dirty list now so that the cleanup * thread will reap them too. 
*/ spin_lock_irq(&pool->pool_lock); list_for_each_entry_safe(fmr, next, &pool->free_list, list) { if (fmr->remap_count > 0) list_move(&fmr->list, &pool->dirty_list); } spin_unlock_irq(&pool->pool_lock); serial = atomic_inc_return(&pool->req_ser); wake_up_process(pool->thread); if (wait_event_interruptible(pool->force_wait, atomic_read(&pool->flush_ser) - serial >= 0)) return -EINTR; return 0; } EXPORT_SYMBOL(ib_flush_fmr_pool); /** * ib_fmr_pool_map_phys - * @pool:FMR pool to allocate FMR from * @page_list:List of pages to map * @list_len:Number of pages in @page_list * @io_virtual_address:I/O virtual address for new FMR * * Map an FMR from an FMR pool. */ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, u64 *page_list, int list_len, u64 io_virtual_address) { struct ib_fmr_pool *pool = pool_handle; struct ib_pool_fmr *fmr; unsigned long flags; int result; if (list_len < 1 || list_len > pool->max_pages) return ERR_PTR(-EINVAL); spin_lock_irqsave(&pool->pool_lock, flags); fmr = ib_fmr_cache_lookup(pool, page_list, list_len, io_virtual_address); if (fmr) { /* found in cache */ ++fmr->ref_count; if (fmr->ref_count == 1) { list_del(&fmr->list); } spin_unlock_irqrestore(&pool->pool_lock, flags); return fmr; } if (list_empty(&pool->free_list)) { spin_unlock_irqrestore(&pool->pool_lock, flags); return ERR_PTR(-EAGAIN); } fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list); list_del(&fmr->list); hlist_del_init(&fmr->cache_node); spin_unlock_irqrestore(&pool->pool_lock, flags); result = ib_map_phys_fmr(fmr->fmr, page_list, list_len, io_virtual_address); if (result) { spin_lock_irqsave(&pool->pool_lock, flags); list_add(&fmr->list, &pool->free_list); spin_unlock_irqrestore(&pool->pool_lock, flags); - printk(KERN_WARNING PFX "fmr_map returns %d\n", result); + pr_warn(PFX "fmr_map returns %d\n", result); return ERR_PTR(result); } ++fmr->remap_count; fmr->ref_count = 1; if (pool->cache_bucket) { fmr->io_virtual_address = io_virtual_address; 
fmr->page_list_len = list_len; memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list)); spin_lock_irqsave(&pool->pool_lock, flags); hlist_add_head(&fmr->cache_node, pool->cache_bucket + ib_fmr_hash(fmr->page_list[0])); spin_unlock_irqrestore(&pool->pool_lock, flags); } return fmr; } EXPORT_SYMBOL(ib_fmr_pool_map_phys); /** * ib_fmr_pool_unmap - Unmap FMR * @fmr:FMR to unmap * * Unmap an FMR. The FMR mapping may remain valid until the FMR is * reused (or until ib_flush_fmr_pool() is called). */ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) { struct ib_fmr_pool *pool; unsigned long flags; pool = fmr->pool; spin_lock_irqsave(&pool->pool_lock, flags); --fmr->ref_count; if (!fmr->ref_count) { if (fmr->remap_count < pool->max_remaps) { list_add_tail(&fmr->list, &pool->free_list); } else { list_add_tail(&fmr->list, &pool->dirty_list); if (++pool->dirty_len >= pool->dirty_watermark) { atomic_inc(&pool->req_ser); wake_up_process(pool->thread); } } } #ifdef DEBUG if (fmr->ref_count < 0) - printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n", - fmr, fmr->ref_count); + pr_warn(PFX "FMR %p has ref count %d < 0\n", + fmr, fmr->ref_count); #endif spin_unlock_irqrestore(&pool->pool_lock, flags); return 0; } EXPORT_SYMBOL(ib_fmr_pool_unmap); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_addr.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_addr.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_addr.c (revision 319974) @@ -0,0 +1,760 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "core_priv.h" + +struct addr_req { + struct list_head list; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; + struct rdma_dev_addr *addr; + struct rdma_addr_client *client; + void *context; + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context); + unsigned long timeout; + int status; +}; + +static void process_req(struct work_struct *work); + +static DEFINE_MUTEX(lock); +static LIST_HEAD(req_list); +static DECLARE_DELAYED_WORK(work, process_req); +static struct workqueue_struct *addr_wq; + +int rdma_addr_size(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct sockaddr_ib); + default: + return 0; + } +} +EXPORT_SYMBOL(rdma_addr_size); + +static struct rdma_addr_client self; + +void rdma_addr_register_client(struct rdma_addr_client *client) +{ + atomic_set(&client->refcount, 1); + init_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_register_client); + +static inline void put_client(struct rdma_addr_client *client) +{ + if (atomic_dec_and_test(&client->refcount)) + complete(&client->comp); +} + +void rdma_addr_unregister_client(struct rdma_addr_client *client) +{ + put_client(client); + wait_for_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_unregister_client); + +static inline void +rdma_copy_addr_sub(u8 *dst, const u8 *src, unsigned min, unsigned max) +{ + if (min > max) + min = max; + memcpy(dst, src, min); + memset(dst + min, 0, max - min); +} + +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, + const unsigned char *dst_dev_addr) +{ + if (dev->if_type == IFT_INFINIBAND) + dev_addr->dev_type = ARPHRD_INFINIBAND; + else if (dev->if_type == IFT_ETHER) + 
dev_addr->dev_type = ARPHRD_ETHER; + else + dev_addr->dev_type = 0; + rdma_copy_addr_sub(dev_addr->src_dev_addr, IF_LLADDR(dev), + dev->if_addrlen, MAX_ADDR_LEN); + rdma_copy_addr_sub(dev_addr->broadcast, dev->if_broadcastaddr, + dev->if_addrlen, MAX_ADDR_LEN); + if (dst_dev_addr != NULL) { + rdma_copy_addr_sub(dev_addr->dst_dev_addr, dst_dev_addr, + dev->if_addrlen, MAX_ADDR_LEN); + } + dev_addr->bound_dev_if = dev->if_index; + return 0; +} +EXPORT_SYMBOL(rdma_copy_addr); + +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, + u16 *vlan_id) +{ + struct net_device *dev = NULL; + int ret = -EADDRNOTAVAIL; + + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; + } + + switch (addr->sa_family) { +#ifdef INET + case AF_INET: + dev = ip_dev_find(dev_addr->net, + ((const struct sockaddr_in *)addr)->sin_addr.s_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: { + struct in6_addr in6_addr = ((const struct sockaddr_in6 *)addr)->sin6_addr; + + /* embed scope ID */ + in6_addr.s6_addr[3] = ((const struct sockaddr_in6 *)addr)->sin6_scope_id; + + dev = ip6_dev_find(dev_addr->net, in6_addr); + break; + } +#endif + default: + break; + } + + if (dev != NULL) { + ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + } + return ret; +} +EXPORT_SYMBOL(rdma_translate_ip); + +static void set_timeout(unsigned long time) +{ + int delay; /* under FreeBSD ticks are 32-bit */ + + delay = time - jiffies; + if (delay <= 0) + delay = 1; + + mod_delayed_work(addr_wq, &work, delay); +} + +static void queue_req(struct addr_req *req) +{ + struct addr_req *temp_req; + + mutex_lock(&lock); + list_for_each_entry_reverse(temp_req, &req_list, list) { + if (time_after_eq(req->timeout, temp_req->timeout)) + break; + } + + list_add(&req->list, 
&temp_req->list); + + if (req_list.next == &req->list) + set_timeout(req->timeout); + mutex_unlock(&lock); +} + +#if defined(INET) || defined(INET6) +static int addr_resolve_multi(u8 *edst, struct ifnet *ifp, struct sockaddr *dst_in) +{ + struct sockaddr *llsa; + struct sockaddr_dl sdl; + int error; + + sdl.sdl_len = sizeof(sdl); + llsa = (struct sockaddr *)&sdl; + + if (ifp->if_resolvemulti == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_resolvemulti(ifp, &llsa, dst_in); + if (error == 0) { + rdma_copy_addr_sub(edst, LLADDR((struct sockaddr_dl *)llsa), + ifp->if_addrlen, MAX_ADDR_LEN); + } + } + return (error); +} +#endif + +#ifdef INET +static int addr4_resolve(struct sockaddr_in *src_in, + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + struct sockaddr_in dst_tmp = *dst_in; + u8 edst[MAX_ADDR_LEN]; + struct rtentry *rte; + struct ifnet *ifp; + int error; + + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. + */ + dst_tmp.sin_len = sizeof(dst_tmp); + + CURVNET_SET(addr->net); + /* set default TTL limit */ + addr->hoplimit = V_ip_defttl; + + /* lookup route for destination */ + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + + /* + * Make sure the route exists and has a valid link. 
+ */ + if (rte == NULL) { + error = EHOSTUNREACH; + goto done; + } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) { + RTFREE_LOCKED(rte); + error = EHOSTUNREACH; + goto done; + } else if (src_in->sin_addr.s_addr != INADDR_ANY) { + RT_UNLOCK(rte); + + ifp = ip_dev_find(addr->net, src_in->sin_addr.s_addr); + if (ifp == NULL) { + RTFREE(rte); + error = ENETUNREACH; + goto done; + } else if (ifp != rte->rt_ifp) { + error = ENETUNREACH; + goto failure; + } + } else { + struct sockaddr *saddr; + + ifp = rte->rt_ifp; + dev_hold(ifp); + + saddr = rte->rt_ifa->ifa_addr; + memcpy(src_in, saddr, rdma_addr_size(saddr)); + RT_UNLOCK(rte); + } + + /* + * Resolve destination MAC address + */ + if (dst_tmp.sin_addr.s_addr == INADDR_BROADCAST) { + rdma_copy_addr_sub(edst, ifp->if_broadcastaddr, + ifp->if_addrlen, MAX_ADDR_LEN); + } else if (IN_MULTICAST(ntohl(dst_tmp.sin_addr.s_addr))) { + error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); + if (error != 0) + goto failure; + } else { + bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + memset(edst, 0, sizeof(edst)); + error = arpresolve(ifp, is_gw, NULL, is_gw ? 
+ rte->rt_gateway : (const struct sockaddr *)&dst_tmp, + edst, NULL, NULL); + if (error != 0) + goto failure; + else if (is_gw != 0) + addr->network = RDMA_NETWORK_IPV4; + } + + /* + * Copy destination and source MAC addresses + */ + error = -rdma_copy_addr(addr, ifp, edst); + if (error != 0) { +failure: + dev_put(ifp); + + if (error == EWOULDBLOCK || error == EAGAIN) + error = ENODATA; + } else { + *ifpp = ifp; + } + RTFREE(rte); +done: + return (-error); +} +#else +static int addr4_resolve(struct sockaddr_in *src_in, + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + return -EADDRNOTAVAIL; +} +#endif + +#ifdef INET6 +static int addr6_resolve(struct sockaddr_in6 *src_in, + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + struct sockaddr_in6 dst_tmp = *dst_in; + u8 edst[MAX_ADDR_LEN]; + struct rtentry *rte; + struct ifnet *ifp; + int error; + + sa6_embedscope(&dst_tmp, 0); + sa6_embedscope(src_in, 0); + + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. + */ + dst_tmp.sin6_len = sizeof(dst_tmp); + + CURVNET_SET(addr->net); + /* set default TTL limit */ + addr->hoplimit = V_ip_defttl; + + /* lookup route for destination */ + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + + /* + * Make sure the route exists and has a valid link. 
+ */ + if (rte == NULL) { + error = EHOSTUNREACH; + goto done; + } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) { + RTFREE_LOCKED(rte); + error = EHOSTUNREACH; + goto done; + } else if (!IN6_IS_ADDR_UNSPECIFIED(&src_in->sin6_addr)) { + RT_UNLOCK(rte); + + ifp = ip6_dev_find(addr->net, src_in->sin6_addr); + if (ifp == NULL) { + RTFREE(rte); + error = ENETUNREACH; + goto done; + } else if (ifp != rte->rt_ifp) { + error = ENETUNREACH; + goto failure; + } + } else { + struct sockaddr *saddr; + + ifp = rte->rt_ifp; + dev_hold(ifp); + + saddr = rte->rt_ifa->ifa_addr; + memcpy(src_in, saddr, rdma_addr_size(saddr)); + RT_UNLOCK(rte); + } + + /* + * Resolve destination MAC address + */ + if (IN6_IS_ADDR_MULTICAST(&dst_tmp.sin6_addr)) { + error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); + if (error != 0) + goto failure; + } else { + bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + memset(edst, 0, sizeof(edst)); + error = nd6_resolve(ifp, is_gw, NULL, is_gw ? 
+ rte->rt_gateway : (const struct sockaddr *)&dst_tmp, + edst, NULL, NULL); + if (error != 0) + goto failure; + else if (is_gw != 0) + addr->network = RDMA_NETWORK_IPV6; + } + + /* + * Copy destination and source MAC addresses + */ + error = -rdma_copy_addr(addr, ifp, edst); + if (error != 0) { +failure: + dev_put(ifp); + + if (error == EWOULDBLOCK || error == EAGAIN) + error = ENODATA; + } else { + *ifpp = ifp; + } + RTFREE(rte); +done: + sa6_recoverscope(&dst_tmp); + sa6_recoverscope(src_in); + + return (-error); +} +#else +static int addr6_resolve(struct sockaddr_in6 *src_in, + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + return -EADDRNOTAVAIL; +} +#endif + +static int addr_resolve_neigh(struct ifnet *dev, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (dev->if_flags & IFF_LOOPBACK) { + int ret; + + ret = rdma_translate_ip(dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + + return ret; + } + + /* If the device doesn't do ARP internally */ + if (!(dev->if_flags & IFF_NOARP)) + return 0; + + return rdma_copy_addr(addr, dev, NULL); +} + +static int addr_resolve(struct sockaddr *src_in, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh) +{ + struct net_device *ndev = NULL; + int ret; + + if (dst_in->sa_family != src_in->sa_family) + return -EINVAL; + + if (src_in->sa_family == AF_INET) { + ret = addr4_resolve((struct sockaddr_in *)src_in, + (const struct sockaddr_in *)dst_in, + addr, &ndev); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(ndev, dst_in, addr); + } else { + ret = addr6_resolve((struct sockaddr_in6 *)src_in, + (const struct sockaddr_in6 *)dst_in, addr, + &ndev); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(ndev, dst_in, addr); + } + + addr->bound_dev_if = ndev->if_index; + addr->net = dev_net(ndev); + dev_put(ndev); + + return ret; +} + 
+static void process_req(struct work_struct *work) +{ + struct addr_req *req, *temp_req; + struct sockaddr *src_in, *dst_in; + struct list_head done_list; + + INIT_LIST_HEAD(&done_list); + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->status == -ENODATA) { + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true); + if (req->status && time_after_eq(jiffies, req->timeout)) + req->status = -ETIMEDOUT; + else if (req->status == -ENODATA) + continue; + } + list_move_tail(&req->list, &done_list); + } + + if (!list_empty(&req_list)) { + req = list_entry(req_list.next, struct addr_req, list); + set_timeout(req->timeout); + } + mutex_unlock(&lock); + + list_for_each_entry_safe(req, temp_req, &done_list, list) { + list_del(&req->list); + req->callback(req->status, (struct sockaddr *) &req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); + } +} + +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context) +{ + struct sockaddr *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) { + ret = -EINVAL; + goto err; + } + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + req->client = client; + atomic_inc(&client->refcount); + + req->status = addr_resolve(src_in, 
dst_in, addr, true); + switch (req->status) { + case 0: + req->timeout = jiffies; + queue_req(req); + break; + case -ENODATA: + req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; + queue_req(req); + break; + default: + ret = req->status; + atomic_dec(&client->refcount); + goto err; + } + return ret; +err: + kfree(req); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ip); + +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr) +{ + struct sockaddr_storage ssrc_addr = {}; + struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + return addr_resolve(src_in, dst_addr, addr, false); +} +EXPORT_SYMBOL(rdma_resolve_ip_route); + +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *temp_req; + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->addr == addr) { + req->status = -ECANCELED; + req->timeout = jiffies; + list_move(&req->list, &req_list); + set_timeout(req->timeout); + break; + } + } + mutex_unlock(&lock); +} +EXPORT_SYMBOL(rdma_addr_cancel); + +struct resolve_cb_context { + struct rdma_dev_addr *addr; + struct completion comp; + int status; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + if (!status) + memcpy(((struct resolve_cb_context *)context)->addr, + addr, sizeof(struct rdma_dev_addr)); + ((struct resolve_cb_context *)context)->status = status; + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, u16 *vlan_id, int *if_index, + int *hoplimit) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + struct net_device 
*dev; + + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + + rdma_gid2ip(&sgid_addr._sockaddr, sgid); + rdma_gid2ip(&dgid_addr._sockaddr, dgid); + + memset(&dev_addr, 0, sizeof(dev_addr)); + if (if_index) + dev_addr.bound_dev_if = *if_index; + dev_addr.net = TD_TO_VNET(curthread); + + ctx.addr = &dev_addr; + init_completion(&ctx.comp); + ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + ret = ctx.status; + if (ret) + return ret; + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + dev = dev_get_by_index(dev_addr.net, dev_addr.bound_dev_if); + if (!dev) + return -ENODEV; + if (if_index) + *if_index = dev_addr.bound_dev_if; + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + if (hoplimit) + *hoplimit = dev_addr.hoplimit; + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } gid_addr; + + rdma_gid2ip(&gid_addr._sockaddr, sgid); + + memset(&dev_addr, 0, sizeof(dev_addr)); + dev_addr.net = TD_TO_VNET(curthread); + ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); + if (ret) + return ret; + + memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); + +int addr_init(void) +{ + addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); + if (!addr_wq) + return -ENOMEM; + + rdma_addr_register_client(&self); + + return 0; +} + +void addr_cleanup(void) +{ + rdma_addr_unregister_client(&self); + destroy_workqueue(addr_wq); +} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_addr.c 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cache.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cache.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cache.c (revision 319974) @@ -0,0 +1,1253 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include + +#include "core_priv.h" + +struct ib_pkey_cache { + int table_len; + u16 table[0]; +}; + +struct ib_update_work { + struct work_struct work; + struct ib_device *device; + u8 port_num; +}; + +union ib_gid zgid; +EXPORT_SYMBOL(zgid); + +static const struct ib_gid_attr zattr; + +enum gid_attr_find_mask { + GID_ATTR_FIND_MASK_GID = 1UL << 0, + GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, + GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, +}; + +enum gid_table_entry_props { + GID_TABLE_ENTRY_INVALID = 1UL << 0, + GID_TABLE_ENTRY_DEFAULT = 1UL << 1, +}; + +enum gid_table_write_action { + GID_TABLE_WRITE_ACTION_ADD, + GID_TABLE_WRITE_ACTION_DEL, + /* MODIFY only updates the GID table. Currently only used by + * ib_cache_update. + */ + GID_TABLE_WRITE_ACTION_MODIFY +}; + +struct ib_gid_table_entry { + unsigned long props; + union ib_gid gid; + struct ib_gid_attr attr; + void *context; +}; + +struct ib_gid_table { + int sz; + /* In RoCE, adding a GID to the table requires: + * (a) Find whether this GID already exists. + * (b) Find a free space. + * (c) Write the new GID + * + * Delete requires a different set of operations: + * (a) Find the GID + * (b) Delete it. + * + * Add/delete should be carried out atomically. + * This is done by locking this mutex from multiple + * writers. We don't need this lock for IB, as the MAD + * layer replaces all entries. All data_vec entries + * are locked by this lock. + **/ + struct mutex lock; + /* This lock protects the table entries from being + * read and written simultaneously.
+ */ + rwlock_t rwlock; + struct ib_gid_table_entry *data_vec; +}; + +static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +{ + if (rdma_cap_roce_gid_table(ib_dev, port)) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } +} + +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + +/* This function expects that rwlock will be write locked in all + * scenarios and that lock will be locked in sleep-able (RoCE) + * scenarios. + */ +static int write_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + enum gid_table_write_action action, + bool default_gid) + __releases(&table->rwlock) __acquires(&table->rwlock) +{ + int ret = 0; + struct net_device *old_net_dev; + enum ib_gid_type old_gid_type; + + /* in rdma_cap_roce_gid_table, this function should be protected by a + * sleep-able lock.
+ */ + + if (rdma_cap_roce_gid_table(ib_dev, port)) { + table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; + write_unlock_irq(&table->rwlock); + /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by + * RoCE providers and thus only updates the cache. + */ + if (action == GID_TABLE_WRITE_ACTION_ADD) + ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr, + &table->data_vec[ix].context); + else if (action == GID_TABLE_WRITE_ACTION_DEL) + ret = ib_dev->del_gid(ib_dev, port, ix, + &table->data_vec[ix].context); + write_lock_irq(&table->rwlock); + } + + old_net_dev = table->data_vec[ix].attr.ndev; + old_gid_type = table->data_vec[ix].attr.gid_type; + if (old_net_dev && old_net_dev != attr->ndev) + dev_put(old_net_dev); + /* if modify_gid failed, just delete the old gid */ + if (ret || action == GID_TABLE_WRITE_ACTION_DEL) { + gid = &zgid; + attr = &zattr; + table->data_vec[ix].context = NULL; + } + + memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); + memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); + if (default_gid) { + table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; + if (action == GID_TABLE_WRITE_ACTION_DEL) + table->data_vec[ix].attr.gid_type = old_gid_type; + } + if (table->data_vec[ix].attr.ndev && + table->data_vec[ix].attr.ndev != old_net_dev) + dev_hold(table->data_vec[ix].attr.ndev); + + table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; + + return ret; +} + +static int add_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, gid, attr, + GID_TABLE_WRITE_ACTION_ADD, default_gid); +} + +static int modify_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, gid, attr, + GID_TABLE_WRITE_ACTION_MODIFY, default_gid); +} + +static int 
del_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, &zgid, &zattr, + GID_TABLE_WRITE_ACTION_DEL, default_gid); +} + +/* rwlock should be read locked */ +static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, + const struct ib_gid_attr *val, bool default_gid, + unsigned long mask, int *pempty) +{ + int i = 0; + int found = -1; + int empty = pempty ? -1 : 0; + + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = &table->data_vec[i]; + struct ib_gid_attr *attr = &data->attr; + int curr_index = i; + + i++; + + if (data->props & GID_TABLE_ENTRY_INVALID) + continue; + + if (empty < 0) + if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && + !memcmp(attr, &zattr, sizeof(*attr)) && + !data->props) + empty = curr_index; + + if (found >= 0) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID && + memcmp(gid, &data->gid, sizeof(*gid))) + continue; + + if (mask & GID_ATTR_FIND_MASK_NETDEV && + attr->ndev != val->ndev) + continue; + + if (mask & GID_ATTR_FIND_MASK_DEFAULT && + !!(data->props & GID_TABLE_ENTRY_DEFAULT) != + default_gid) + continue; + + found = curr_index; + } + + if (pempty) + *pempty = empty; + + return found; +} + +static void addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +{ + if (dev->if_addrlen != ETH_ALEN) + return; + memcpy(eui, IF_LLADDR(dev), 3); + memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); + + /* NOTE: The scope ID is added by the GID to IP conversion */ + + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; +} + +static void make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + addrconf_ifid_eui48(&gid->raw[8], dev); +} + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct 
ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; + int ret = 0; + struct net_device *idev; + int empty; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (!memcmp(gid, &zgid, sizeof(*gid))) + return -EINVAL; + + if (ib_dev->get_netdev) { + idev = ib_dev->get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; + + /* Adding default GIDs in not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { + dev_put(idev); + return -EPERM; + } + } + if (idev) + dev_put(idev); + } + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + + ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV, &empty); + if (ix >= 0) + goto out_unlock; + + if (empty < 0) { + ret = -ENOSPC; + goto out_unlock; + } + + ret = add_gid(ib_dev, port, table, empty, gid, attr, false); + if (!ret) + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + return ret; +} + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + + ix = find_gid(table, gid, attr, false, + GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + if (ix < 0) + goto out_unlock; + + if (!del_gid(ib_dev, port, table, ix, false)) + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + return 0; +} + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev) +{ + struct ib_gid_table **ports_table = 
ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; + bool deleted = false; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + + for (ix = 0; ix < table->sz; ix++) + if (table->data_vec[ix].attr.ndev == ndev) + if (!del_gid(ib_dev, port, table, ix, + !!(table->data_vec[ix].props & + GID_TABLE_ENTRY_DEFAULT))) + deleted = true; + + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + + return 0; +} + +static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (index < 0 || index >= table->sz) + return -EINVAL; + + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) + return -EAGAIN; + + memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); + if (attr) { + memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); + if (attr->ndev) + dev_hold(attr->ndev); + } + + return 0; +} + +static int _ib_cache_gid_table_find(struct ib_device *ib_dev, + const union ib_gid *gid, + const struct ib_gid_attr *val, + unsigned long mask, + u8 *port, u16 *index) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + u8 p; + int local_index; + unsigned long flags; + + for (p = 0; p < ib_dev->phys_port_cnt; p++) { + table = ports_table[p]; + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, val, false, mask, NULL); + if (local_index >= 0) { + if (index) + *index = local_index; + if (port) + *port = p + rdma_start_port(ib_dev); + read_unlock_irqrestore(&table->rwlock, flags); + return 0; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return -ENOENT; +} + +static int ib_cache_gid_find(struct ib_device *ib_dev, + const 
union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, u8 *port, + u16 *index) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, + mask, port, index); +} + +int ib_find_cached_gid_by_port(struct ib_device *ib_dev, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port, struct net_device *ndev, + u16 *index) +{ + int local_index; + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + unsigned long flags; + + if (port < rdma_start_port(ib_dev) || + port > rdma_end_port(ib_dev)) + return -ENOENT; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, &val, false, mask, NULL); + if (local_index >= 0) { + if (index) + *index = local_index; + read_unlock_irqrestore(&table->rwlock, flags); + return 0; + } + + read_unlock_irqrestore(&table->rwlock, flags); + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_cached_gid_by_port); + +/** + * ib_find_gid_by_filter - Returns the GID table index where a specified + * GID value occurs + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value could be + * searched. + * @filter: The filter function is executed on any matching GID in the table. + * If the filter function returns true, the corresponding index is returned, + * otherwise, we continue searching the GID table. It's guaranteed that + * while filter is executed, ndev field is valid and the structure won't + * change. 
filter is executed in an atomic context. filter must not be NULL. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_cache_gid_find_by_filter() searches for the specified GID value + * of which the filter function returns true in the port's GID table. + * This function is only supported on RoCE ports. + * + */ +static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, + const union ib_gid *gid, + u8 port, + bool (*filter)(const union ib_gid *, + const struct ib_gid_attr *, + void *), + void *context, + u16 *index) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + unsigned int i; + unsigned long flags; + bool found = false; + + if (!ports_table) + return -EOPNOTSUPP; + + if (port < rdma_start_port(ib_dev) || + port > rdma_end_port(ib_dev) || + !rdma_protocol_roce(ib_dev, port)) + return -EPROTONOSUPPORT; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + struct ib_gid_attr attr; + + if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + goto next; + + if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + goto next; + + memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); + + if (filter(gid, &attr, context)) + found = true; + +next: + if (found) + break; + } + read_unlock_irqrestore(&table->rwlock, flags); + + if (!found) + return -ENOENT; + + if (index) + *index = i; + return 0; +} + +static struct ib_gid_table *alloc_gid_table(int sz) +{ + struct ib_gid_table *table = + kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + + if (!table) + return NULL; + + table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); + if (!table->data_vec) + goto err_free_table; + + mutex_init(&table->lock); + + table->sz = sz; + rwlock_init(&table->rwlock); + + return table; + +err_free_table: + kfree(table); + return NULL; +} + +static void 
release_gid_table(struct ib_gid_table *table) +{ + if (table) { + kfree(table->data_vec); + kfree(table); + } +} + +static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + int i; + bool deleted = false; + + if (!table) + return; + + write_lock_irq(&table->rwlock); + for (i = 0; i < table->sz; ++i) { + if (memcmp(&table->data_vec[i].gid, &zgid, + sizeof(table->data_vec[i].gid))) + if (!del_gid(ib_dev, port, table, i, + table->data_vec[i].props & + GID_ATTR_FIND_MASK_DEFAULT)) + deleted = true; + } + write_unlock_irq(&table->rwlock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); +} + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + union ib_gid gid; + struct ib_gid_attr gid_attr; + struct ib_gid_attr zattr_type = zattr; + struct ib_gid_table *table; + unsigned int gid_type; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + make_default_gid(ndev, &gid); + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + int ix; + union ib_gid current_gid; + struct ib_gid_attr current_gid_attr = {}; + + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + ix = find_gid(table, NULL, &gid_attr, true, + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + + /* Coudn't find default GID location */ + if (WARN_ON(ix < 0)) + goto release; + + zattr_type.gid_type = gid_type; + + if (!__ib_cache_gid_get(ib_dev, port, ix, + ¤t_gid, ¤t_gid_attr) && + mode == IB_CACHE_GID_DEFAULT_MODE_SET && + !memcmp(&gid, ¤t_gid, sizeof(gid)) && + !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) + goto release; + + if (memcmp(¤t_gid, &zgid, sizeof(current_gid)) 
|| + memcmp(¤t_gid_attr, &zattr_type, + sizeof(current_gid_attr))) { + if (del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto release; + } else { + dispatch_gid_change_event(ib_dev, port); + } + } + + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + pr_warn("ib_cache_gid: unable to add default gid %pI6\n", + gid.raw); + else + dispatch_gid_change_event(ib_dev, port); + } + +release: + if (current_gid_attr.ndev) + dev_put(current_gid_attr.ndev); + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + } +} + +static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + unsigned int current_gid = 0; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + for (i = 0; i < num_default_gids && i < table->sz; i++) { + struct ib_gid_table_entry *entry = + &table->data_vec[i]; + + entry->props |= GID_TABLE_ENTRY_DEFAULT; + current_gid = find_next_bit(&roce_gid_type_mask, + BITS_PER_LONG, + current_gid); + entry->attr.gid_type = current_gid++; + } + + return 0; +} + +static int _gid_table_setup_one(struct ib_device *ib_dev) +{ + u8 port; + struct ib_gid_table **table; + int err = 0; + + table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL); + + if (!table) { + pr_warn("failed to allocate ib gid cache for %s\n", + ib_dev->name); + return -ENOMEM; + } + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + u8 rdma_port = port + rdma_start_port(ib_dev); + + table[port] = + alloc_gid_table( + ib_dev->port_immutable[rdma_port].gid_tbl_len); + if (!table[port]) { + err = -ENOMEM; + goto rollback_table_setup; + } + + err = gid_table_reserve_default(ib_dev, + port + rdma_start_port(ib_dev), + table[port]); + if 
(err) + goto rollback_table_setup; + } + + ib_dev->cache.gid_cache = table; + return 0; + +rollback_table_setup: + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table[port]); + release_gid_table(table[port]); + } + + kfree(table); + return err; +} + +static void gid_table_release_one(struct ib_device *ib_dev) +{ + struct ib_gid_table **table = ib_dev->cache.gid_cache; + u8 port; + + if (!table) + return; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) + release_gid_table(table[port]); + + kfree(table); + ib_dev->cache.gid_cache = NULL; +} + +static void gid_table_cleanup_one(struct ib_device *ib_dev) +{ + struct ib_gid_table **table = ib_dev->cache.gid_cache; + u8 port; + + if (!table) + return; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table[port]); +} + +static int gid_table_setup_one(struct ib_device *ib_dev) +{ + int err; + + err = _gid_table_setup_one(ib_dev); + + if (err) + return err; + + err = roce_rescan_device(ib_dev); + + if (err) { + gid_table_cleanup_one(ib_dev); + gid_table_release_one(ib_dev); + } + + return err; +} + +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid, + struct ib_gid_attr *gid_attr) +{ + int res; + unsigned long flags; + struct ib_gid_table **ports_table = device->cache.gid_cache; + struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&table->rwlock, flags); + res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_unlock_irqrestore(&table->rwlock, flags); + + return res; +} +EXPORT_SYMBOL(ib_get_cached_gid); + +int ib_find_cached_gid(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, + u8 *port_num, + u16 
*index) +{ + return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); +} +EXPORT_SYMBOL(ib_find_cached_gid); + +int ib_find_gid_by_filter(struct ib_device *device, + const union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index) +{ + /* Only RoCE GID table supports filter function */ + if (!rdma_cap_roce_gid_table(device, port_num) && filter) + return -EPROTONOSUPPORT; + + return ib_cache_gid_find_by_filter(device, gid, + port_num, filter, + context, index); +} +EXPORT_SYMBOL(ib_find_gid_by_filter); + +int ib_get_cached_pkey(struct ib_device *device, + u8 port_num, + int index, + u16 *pkey) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int ret = 0; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + + if (index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *pkey = cache->table[index]; + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_pkey); + +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + int partial_ix = -1; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + if (cache->table[i] & 0x8000) { + *index = i; + ret = 0; + break; + } else + partial_ix = i; + } + + if (ret && partial_ix >= 0) { + *index = partial_ix; + ret = 0; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} 
+EXPORT_SYMBOL(ib_find_cached_pkey); + +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if (cache->table[i] == pkey) { + *index = i; + ret = 0; + break; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_exact_cached_pkey); + +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc) +{ + unsigned long flags; + int ret = 0; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + *lmc = device->cache.lmc_cache[port_num - rdma_start_port(device)]; + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_lmc); + +static void ib_cache_update(struct ib_device *device, + u8 port) +{ + struct ib_port_attr *tprops = NULL; + struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; + struct ib_gid_cache { + int table_len; + union ib_gid table[0]; + } *gid_cache = NULL; + int i; + int ret; + struct ib_gid_table *table; + struct ib_gid_table **ports_table = device->cache.gid_cache; + bool use_roce_gid_table = + rdma_cap_roce_gid_table(device, port); + + if (port < rdma_start_port(device) || port > rdma_end_port(device)) + return; + + table = ports_table[port - rdma_start_port(device)]; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return; + + ret = ib_query_port(device, port, tprops); + if (ret) { + pr_warn("ib_query_port failed (%d) for %s\n", + ret, device->name); + goto err; + } + + pkey_cache = kmalloc(sizeof *pkey_cache + 
tprops->pkey_tbl_len * + sizeof *pkey_cache->table, GFP_KERNEL); + if (!pkey_cache) + goto err; + + pkey_cache->table_len = tprops->pkey_tbl_len; + + if (!use_roce_gid_table) { + gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len * + sizeof(*gid_cache->table), GFP_KERNEL); + if (!gid_cache) + goto err; + + gid_cache->table_len = tprops->gid_tbl_len; + } + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, pkey_cache->table + i); + if (ret) { + pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + + if (!use_roce_gid_table) { + for (i = 0; i < gid_cache->table_len; ++i) { + ret = ib_query_gid(device, port, i, + gid_cache->table + i, NULL); + if (ret) { + pr_warn("ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + } + + write_lock_irq(&device->cache.lock); + + old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)]; + + device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; + if (!use_roce_gid_table) { + write_lock(&table->rwlock); + for (i = 0; i < gid_cache->table_len; i++) { + modify_gid(device, port, table, i, gid_cache->table + i, + &zattr, false); + } + write_unlock(&table->rwlock); + } + + device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; + + write_unlock_irq(&device->cache.lock); + + kfree(gid_cache); + kfree(old_pkey_cache); + kfree(tprops); + return; + +err: + kfree(pkey_cache); + kfree(gid_cache); + kfree(tprops); +} + +static void ib_cache_task(struct work_struct *_work) +{ + struct ib_update_work *work = + container_of(_work, struct ib_update_work, work); + + ib_cache_update(work->device, work->port_num); + kfree(work); +} + +static void ib_cache_event(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct ib_update_work *work; + + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == 
IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_GID_CHANGE) { + work = kmalloc(sizeof *work, GFP_ATOMIC); + if (work) { + INIT_WORK(&work->work, ib_cache_task); + work->device = event->device; + work->port_num = event->element.port_num; + queue_work(ib_wq, &work->work); + } + } +} + +int ib_cache_setup_one(struct ib_device *device) +{ + int p; + int err; + + rwlock_init(&device->cache.lock); + + device->cache.pkey_cache = + kzalloc(sizeof *device->cache.pkey_cache * + (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); + device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * + (rdma_end_port(device) - + rdma_start_port(device) + 1), + GFP_KERNEL); + if (!device->cache.pkey_cache || + !device->cache.lmc_cache) { + pr_warn("Couldn't allocate cache for %s\n", device->name); + return -ENOMEM; + } + + err = gid_table_setup_one(device); + if (err) + /* Allocated memory will be cleaned in the release function */ + return err; + + for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) + ib_cache_update(device, p + rdma_start_port(device)); + + INIT_IB_EVENT_HANDLER(&device->cache.event_handler, + device, ib_cache_event); + err = ib_register_event_handler(&device->cache.event_handler); + if (err) + goto err; + + return 0; + +err: + gid_table_cleanup_one(device); + return err; +} + +void ib_cache_release_one(struct ib_device *device) +{ + int p; + + /* + * The release function frees all the cache elements. + * This function should be called as part of freeing + * all the device's resources when the cache could no + * longer be accessed. 
+ */ + if (device->cache.pkey_cache) + for (p = 0; + p <= rdma_end_port(device) - rdma_start_port(device); ++p) + kfree(device->cache.pkey_cache[p]); + + gid_table_release_one(device); + kfree(device->cache.pkey_cache); + kfree(device->cache.lmc_cache); +} + +void ib_cache_cleanup_one(struct ib_device *device) +{ + /* The cleanup function unregisters the event handler, + * waits for all in-progress workqueue elements and cleans + * up the GID cache. This function should be called after + * the device was removed from the devices list and all + * clients were removed, so the cache exists but is + * non-functional and shouldn't be updated anymore. + */ + ib_unregister_event_handler(&device->cache.event_handler); + flush_workqueue(ib_wq); + gid_table_cleanup_one(device); +} + +void __init ib_cache_setup(void) +{ + roce_gid_mgmt_init(); +} + +void __exit ib_cache_cleanup(void) +{ + roce_gid_mgmt_cleanup(); +} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cache.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cq.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cq.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cq.c (revision 319974) @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include + +#define IB_CQ_POLL_MAX 16 +/* maximum number of completions per poll loop */ +#define IB_CQ_POLL_BUDGET 65536 +#define IB_CQ_POLL_FLAGS (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) + +static void +ib_cq_poll_work(struct work_struct *work) +{ + struct ib_wc ib_wc[IB_CQ_POLL_MAX]; + struct ib_cq *cq = container_of(work, struct ib_cq, work); + int total = 0; + int i; + int n; + + while (1) { + n = ib_poll_cq(cq, IB_CQ_POLL_MAX, ib_wc); + for (i = 0; i < n; i++) { + struct ib_wc *wc = ib_wc + i; + + if (wc->wr_cqe != NULL) + wc->wr_cqe->done(cq, wc); + } + + if (n != IB_CQ_POLL_MAX) { + if (ib_req_notify_cq(cq, IB_CQ_POLL_FLAGS) > 0) + break; + else + return; + } + total += n; + if (total >= IB_CQ_POLL_BUDGET) + break; + } + + /* give other work structs a chance */ + queue_work(ib_comp_wq, &cq->work); +} + +static void +ib_cq_completion_workqueue(struct ib_cq *cq, void *private) +{ + queue_work(ib_comp_wq, &cq->work); +} + +struct ib_cq * +ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + + /* + * Check for invalid parameters early on to avoid + * extra error handling code: + */ + switch (poll_ctx) { + case IB_POLL_DIRECT: + case IB_POLL_SOFTIRQ: + case IB_POLL_WORKQUEUE: + break; + default: + return (ERR_PTR(-EINVAL)); + } + + cq = dev->create_cq(dev, &cq_attr, NULL, NULL); + if (IS_ERR(cq)) + return (cq); + + cq->device = dev; + cq->uobject = NULL; + cq->event_handler = NULL; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + + switch (poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = NULL; /* no hardware completions */ + break; + case IB_POLL_SOFTIRQ: + case IB_POLL_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, 
IB_CQ_NEXT_COMP); + break; + default: + break; + } + return (cq); +} +EXPORT_SYMBOL(ib_alloc_cq); + +void +ib_free_cq(struct ib_cq *cq) +{ + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt) != 0)) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + case IB_POLL_WORKQUEUE: + flush_work(&cq->work); + break; + default: + break; + } + + (void)cq->device->destroy_cq(cq); +} +EXPORT_SYMBOL(ib_free_cq); Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_cq.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c (revision 319974) @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static int iwpm_user_pid = IWPM_PID_UNDEFINED; + +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} +EXPORT_SYMBOL(iwpm_valid_pid); + Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_util.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_util.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_util.c (revision 319974) @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "iwpm_util.h" + +#define IWPM_MAPINFO_HASH_SIZE 512 +#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) +#define IWPM_REMINFO_HASH_SIZE 64 +#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) +#define IWPM_MSG_SIZE 512 +/* Compare two socket addresses. Returns 0 only when both have the same family (AF_INET or AF_INET6), the same address bytes and the same port; returns 1 in every other case. When the families are equal but neither AF_INET nor AF_INET6, an error is logged with pr_err() before returning 1. */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} +/* Emit one pr_debug() line made of msg, the IPv4 or IPv6 address and the port (converted with ntohs() and printed in decimal and hex). Address families other than AF_INET and AF_INET6 print nothing. */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_iwpm_util.c 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c (revision 319974) @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2015-2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include +#include +#include + +#include +#include + +#include + +static struct workqueue_struct *roce_gid_mgmt_wq; + +enum gid_op_type { + GID_DEL = 0, + GID_ADD +}; + +struct roce_gid_scan_event_work { + struct work_struct work; + struct net_device *ndev; +}; + +struct roce_rescan_work { + struct work_struct work; + struct ib_device *ib_dev; +}; + +static const struct { + bool (*is_supported)(const struct ib_device *device, u8 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) +/* Return a bitmask of the GID types the port supports, where bit n stands for enum ib_gid_type value n. A port that is not RoCE reports only IB_GID_TYPE_IB; a RoCE port reports every type whose PORT_CAP_TO_GID_TYPE predicate accepts it. The accumulator is unsigned long so that it matches both the return type and the 1UL shifts ORed into it. */ +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + int i; + unsigned long ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); +/* Add (GID_ADD) or delete (GID_DEL) one GID in the cache of (ib_dev, port), once for each GID type set in the port's supported-type mask. The attribute passed to the cache is zeroed except for ndev and gid_type. */ +static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, + u8 port, union ib_gid *gid, struct net_device *ndev) +{ + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + struct ib_gid_attr gid_attr; + + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + for (i = 0; i != IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr.gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, &gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, 
&gid_attr); + break; + } + } + } +} +/* Filter callback: true only when idev is non-NULL and is the net_device passed in cookie. ib_dev and port are not used. */ +static int +roce_gid_match_netdev(struct ib_device *ib_dev, u8 port, + struct net_device *idev, void *cookie) +{ + struct net_device *ndev = (struct net_device *)cookie; + if (idev == NULL) + return (0); + return (ndev == idev); +} +/* Filter callback: accepts every non-NULL idev. ib_dev, port and cookie are not used. */ +static int +roce_gid_match_all(struct ib_device *ib_dev, u8 port, + struct net_device *idev, void *cookie) +{ + if (idev == NULL) + return (0); + return (1); +} +/* Ask the GID cache to set the default GIDs of (ib_dev, port) for every supported GID type, and return the number of supported types (population count of the type mask). */ +static int +roce_gid_enum_netdev_default(struct ib_device *ib_dev, + u8 port, struct net_device *idev) +{ + unsigned long gid_type_mask; + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, idev, gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_SET); + + return (hweight_long(gid_type_mask)); +} + +#define ETH_IPOIB_DRV_NAME "ib" +/* Return 1 when the interface driver name (if_dname) is exactly ETH_IPOIB_DRV_NAME, 0 otherwise. */ +static inline int +is_eth_ipoib_intf(struct net_device *dev) +{ + if (strcmp(dev->if_dname, ETH_IPOIB_DRV_NAME)) + return 0; + return 1; +} +/* Bring the cached GIDs of (device, port) in line with the IPv4/IPv6 addresses configured on ndev and on the L2VLAN interfaces whose real device is ndev. Steps: set the default GIDs; copy the addresses into a private list while holding the ifnet and per-interface address read locks; add a GID for every address that has none in the cache; delete cached GIDs, starting at index default_gids, that match no copied address; free the list. cookie is not used. */ +static void +roce_gid_update_addr_callback(struct ib_device *device, u8 port, + struct net_device *ndev, void *cookie) +{ + struct ipx_entry { + STAILQ_ENTRY(ipx_entry) entry; + union ipx_addr { + struct sockaddr sa[0]; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } ipx_addr; + }; + struct ipx_entry *entry; + struct net_device *idev; + struct ifaddr *ifa; + union ib_gid gid; + int default_gids; + u16 index_num; + int i; + + STAILQ_HEAD(, ipx_entry) ipx_head; + + STAILQ_INIT(&ipx_head); + + /* make sure default GIDs are in */ + default_gids = roce_gid_enum_netdev_default(device, port, ndev); + + CURVNET_SET(ndev->if_vnet); + IFNET_RLOCK(); + TAILQ_FOREACH(idev, &V_ifnet, if_link) { + if (idev != ndev) { + if (idev->if_type != IFT_L2VLAN) + continue; + if (ndev != rdma_vlan_dev_real_dev(idev)) + continue; + } + + /* clone address information for IPv4 and IPv6 */ + IF_ADDR_RLOCK(idev); +#if defined(INET) + TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family 
!= AF_INET) + continue; + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + pr_warn("roce_gid_update_addr_callback: " + "couldn't allocate entry for IPv4 update\n"); + continue; + } + entry->ipx_addr.v4 = *((struct sockaddr_in *)ifa->ifa_addr); + STAILQ_INSERT_TAIL(&ipx_head, entry, entry); + } +#endif +#if defined(INET6) + TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET6) + continue; + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + pr_warn("roce_gid_update_addr_callback: " + "couldn't allocate entry for IPv6 update\n"); + continue; + } + entry->ipx_addr.v6 = *((struct sockaddr_in6 *)ifa->ifa_addr); + + /* trash IPv6 scope ID */ + sa6_recoverscope(&entry->ipx_addr.v6); + entry->ipx_addr.v6.sin6_scope_id = 0; + + STAILQ_INSERT_TAIL(&ipx_head, entry, entry); + } +#endif + IF_ADDR_RUNLOCK(idev); + } + IFNET_RUNLOCK(); + CURVNET_RESTORE(); + + /* add missing GIDs, if any */ + STAILQ_FOREACH(entry, &ipx_head, entry) { + unsigned long gid_type_mask = roce_gid_type_mask_support(device, port); + + if (rdma_ip2gid(&entry->ipx_addr.sa[0], &gid) != 0) + continue; + + for (i = 0; i != IB_GID_TYPE_SIZE; i++) { + if (!((1UL << i) & gid_type_mask)) + continue; + /* check if entry found */ + if (ib_find_cached_gid_by_port(device, &gid, i, + port, ndev, &index_num) == 0) + break; + } + if (i != IB_GID_TYPE_SIZE) + continue; + /* add new GID */ + update_gid(GID_ADD, device, port, &gid, ndev); + } + + /* remove stale GIDs, if any */ + for (i = default_gids; ib_get_cached_gid(device, port, i, &gid, NULL) == 0; i++) { + union ipx_addr ipx; + + /* don't delete empty entries */ + if (memcmp(&gid, &zgid, sizeof(zgid)) == 0) + continue; + + /* zero default */ + memset(&ipx, 0, sizeof(ipx)); + + rdma_gid2ip(&ipx.sa[0], &gid); + + STAILQ_FOREACH(entry, &ipx_head, entry) { + if (memcmp(&entry->ipx_addr, &ipx, sizeof(ipx)) == 0) + break; + } + /* check if entry found */ + if 
(entry != NULL) + continue; + + /* remove GID */ + update_gid(GID_DEL, device, port, &gid, ndev); + } + + while ((entry = STAILQ_FIRST(&ipx_head))) { + STAILQ_REMOVE_HEAD(&ipx_head, entry); + kfree(entry); + } +} +/* Work handler: run roce_gid_update_addr_callback() on every RoCE port whose netdev matches work->ndev, then drop the reference taken by roce_gid_queue_scan_event() and free the work item. */ +static void +roce_gid_queue_scan_event_handler(struct work_struct *_work) +{ + struct roce_gid_scan_event_work *work = + container_of(_work, struct roce_gid_scan_event_work, work); + + ib_enum_all_roce_netdevs(roce_gid_match_netdev, work->ndev, + roce_gid_update_addr_callback, NULL); + + dev_put(work->ndev); + kfree(work); +} +/* Queue an asynchronous GID rescan for ndev on roce_gid_mgmt_wq. Interfaces accepted by is_eth_ipoib_intf() are ignored; an IFT_L2VLAN interface is replaced by its real device and checked again; any other non-IFT_ETHER interface is ignored. A reference on ndev is held until the handler runs. The work item is allocated with GFP_ATOMIC; on failure a warning is logged and the event is dropped. */ +static void +roce_gid_queue_scan_event(struct net_device *ndev) +{ + struct roce_gid_scan_event_work *work; + +retry: + if (is_eth_ipoib_intf(ndev)) + return; + + if (ndev->if_type != IFT_ETHER) { + if (ndev->if_type == IFT_L2VLAN) { + ndev = rdma_vlan_dev_real_dev(ndev); + if (ndev != NULL) + goto retry; + } + return; + } + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); + return; + } + + INIT_WORK(&work->work, roce_gid_queue_scan_event_handler); + dev_hold(ndev); + + work->ndev = ndev; + + queue_work(roce_gid_mgmt_wq, &work->work); +} +/* Notifier callback; nb_inetaddr, which points at it, is registered on both the inetaddr and the netdevice notifier chains in roce_gid_mgmt_init(). ptr is used as the struct net_device. Register, unregister and address-change events schedule a rescan; the return value is always NOTIFY_DONE. */ +static int +inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *ndev = ptr; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGEIFADDR: + roce_gid_queue_scan_event(ndev); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nb_inetaddr = { + .notifier_call = inetaddr_event +}; +/* Work handler: run roce_gid_update_addr_callback() for the netdevs of work->ib_dev accepted by roce_gid_match_all(), then free the work item. */ +static void +roce_rescan_device_handler(struct work_struct *_work) +{ + struct roce_rescan_work *work = + container_of(_work, struct roce_rescan_work, work); + + ib_enum_roce_netdev(work->ib_dev, roce_gid_match_all, NULL, + roce_gid_update_addr_callback, NULL); + kfree(work); +} + +/* Caller must flush system workqueue before removing the ib_device. NOTE(review): the rescan work is queued on roce_gid_mgmt_wq, not on the system workqueue -- confirm which queue callers have to flush. */ +int roce_rescan_device(struct ib_device 
*ib_dev) +{ + struct roce_rescan_work *work = kmalloc(sizeof(*work), GFP_KERNEL); + + if (!work) + return -ENOMEM; + + work->ib_dev = ib_dev; + INIT_WORK(&work->work, roce_rescan_device_handler); + queue_work(roce_gid_mgmt_wq, &work->work); /* roce_rescan_device_handler() frees work */ + + return 0; +} +/* Module init: allocate the ordered workqueue roce_gid_mgmt_wq and register nb_inetaddr on the inetaddr and netdevice notifier chains. Returns 0, or -ENOMEM when the workqueue cannot be allocated; the results of the two notifier registrations are not checked. */ +int __init roce_gid_mgmt_init(void) +{ + roce_gid_mgmt_wq = alloc_ordered_workqueue("roce_gid_mgmt_wq", 0); + if (!roce_gid_mgmt_wq) { + pr_warn("roce_gid_mgmt: can't allocate work queue\n"); + return -ENOMEM; + } + + register_inetaddr_notifier(&nb_inetaddr); + + /* + * We rely on the netdevice notifier to enumerate all existing + * devices in the system. Register to this notifier last to + * make sure we will not miss any IP add/del callbacks. + */ + register_netdevice_notifier(&nb_inetaddr); + + return 0; +} +/* Module exit: unregister both notifiers, wait for an RCU grace period, then drain and destroy roce_gid_mgmt_wq. */ +void __exit roce_gid_mgmt_cleanup(void) +{ + unregister_inetaddr_notifier(&nb_inetaddr); + unregister_netdevice_notifier(&nb_inetaddr); + + /* + * Ensure all gid deletion tasks complete before we go down, + * to avoid any reference to free'd memory. By the time + * ib-core is removed, all physical devices have been removed, + * so no issue with remaining hardware contexts. 
+ */ + synchronize_rcu(); + drain_workqueue(roce_gid_mgmt_wq); + destroy_workqueue(roce_gid_mgmt_wq); +} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_smi.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_smi.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_smi.c (revision 319974) @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include "smi.h" +#include "opa_smi.h" +/* Send-side directed-route check shared by the IB and OPA wrappers. Rejects hop_cnt values of IB_SMP_MAX_PATH_HOPS or more, then increments *hop_ptr for outgoing SMPs (direction == 0) or decrements it for returning ones, and returns IB_SMI_HANDLE or IB_SMI_DISCARD. The two path arrays are only read here. */ +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + const u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 */ + if (hop_cnt && *hop_ptr == 0) { + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:2 */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* return_path set when received */ + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + /* return_path set when received */ + (*hop_ptr)++; + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- Fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- at the end of the DR segment of path */ + if (*hop_ptr == 1) { + (*hop_ptr)--; + /* C14-13:3 -- SMPs destined for SM shouldn't be here */ + return (is_switch || + dr_slid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ + if (*hop_ptr == 0) + return IB_SMI_HANDLE; + + /* C14-13:5 -- Check for unreasonable hop pointer */ + return IB_SMI_DISCARD; + } +} + +/* + * Fixup a directed route SMP for sending + * Return IB_SMI_DISCARD if the SMP should be discarded + */ +enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + bool is_switch, int port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} +/* OPA counterpart of smi_handle_dr_smp_send(): the same check, fed from the route.dr fields of struct opa_smp and comparing the DR LIDs against OPA_LID_PERMISSIVE. */ +enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} +/* Receive-side directed-route check shared by the IB and OPA wrappers. May store port_num into return_path and may decrement *hop_ptr; returns IB_SMI_HANDLE or IB_SMI_DISCARD. */ +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, + int phys_port_cnt, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid 
hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 -- sender should have incremented hop_ptr */ + if (hop_cnt && *hop_ptr == 0) + return IB_SMI_DISCARD; + + /* C14-9:2 -- intermediate hop */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + return (initial_path[*hop_ptr+1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + if (hop_cnt) + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* hop_ptr updated when sending */ + return (return_path[*hop_ptr-1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == 1) { + if (dr_slid_is_permissive) { + /* giving SMP to SM - update hop_ptr */ + (*hop_ptr)--; + return IB_SMI_HANDLE; + } + /* hop_ptr updated when sending */ + return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> give to SM */ + /* C14-13:5 -- Check for unreasonable hop pointer */ + return (*hop_ptr == 0 ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); + } +} + +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +/* + * Adjust information for a received OPA SMP (struct opa_smp) + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} +/* Classify a directed-route SMP from hop_ptr, hop_cnt and direction without modifying anything: IB_SMI_FORWARD for an intermediate hop, IB_SMI_SEND or IB_SMI_LOCAL at the end of the DR segment (chosen by the permissive-LID flags), and IB_SMI_LOCAL when no rule matches. */ +static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + if (!direction) { + /* C14-9:2 -- intermediate hop */ + if (hop_ptr && hop_ptr < hop_cnt) + return IB_SMI_FORWARD; + + /* C14-9:3 -- at the end of the DR segment of path */ + if (hop_ptr == hop_cnt) + return (dr_dlid_is_permissive ? + IB_SMI_SEND : IB_SMI_LOCAL); + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + if (hop_ptr == hop_cnt + 1) + return IB_SMI_SEND; + } else { + /* C14-13:2 -- intermediate hop */ + if (2 <= hop_ptr && hop_ptr <= hop_cnt) + return IB_SMI_FORWARD; + + /* C14-13:3 -- at the end of the DR segment of path */ + if (hop_ptr == 1) + return (!dr_slid_is_permissive ? 
+ IB_SMI_SEND : IB_SMI_LOCAL); + } + return IB_SMI_LOCAL; + +} +/* Forwarding classification for an IB SMP; see __smi_check_forward_dr_smp(). The permissive flags are computed against IB_LID_PERMISSIVE. */ +enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} +/* Forwarding classification for an OPA SMP; see __smi_check_forward_dr_smp(). The permissive flags are computed against OPA_LID_PERMISSIVE. */ +enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + * NOTE(review): hop_ptr is not range-checked here; presumably callers only + * use this after the SMP was classified as IB_SMI_FORWARD -- confirm. + */ +int smi_get_fwd_port(struct ib_smp *smp) +{ + return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] : + smp->return_path[smp->hop_ptr-1]); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP (OPA variant, reads route.dr paths) + */ +int opa_smi_get_fwd_port(struct opa_smp *smp) +{ + return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] : + smp->route.dr.return_path[smp->hop_ptr-1]; +} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_smi.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_sysfs.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_sysfs.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_sysfs.c (revision 319974) @@ -0,0 +1,1327 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. 
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "core_priv.h" + +#include +#include +#include +#include +#include + +#include +#include + +struct ib_port; +/* Per-port GID attribute directory: its own kobject, a back pointer to the owning ib_port, and two attribute groups (ndev, type). */ +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; +struct ib_port { + struct kobject kobj; + struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; + struct attribute_group gid_group; + struct attribute_group pkey_group; + struct attribute_group *pma_table; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + u8 port_num; +}; +/* A sysfs attribute whose show/store callbacks receive the owning ib_port instead of a raw kobject. */ +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct ib_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +#define PORT_ATTR(_name, _mode, _show, _store) \ +struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) + +#define PORT_ATTR_RO(_name) \ +struct port_attribute port_attr_##_name = __ATTR_RO(_name) + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; + __be16 attr_id; +}; + +struct hw_stats_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count); + int index; + u8 port_num; +}; +/* sysfs show dispatcher for port attributes: recovers the port_attribute from attr and the ib_port from kobj via container_of(), then calls the attribute's show method. Returns -EIO when the attribute has no show method. */ +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show +}; +/* Same dispatch as port_attr_show(), except that kobj is the one embedded in a struct gid_attr_group and the ib_port is reached through that group's port pointer. */ +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct 
port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; +/* Print the port state as a number followed by its name; values outside the state_name table print as UNKNOWN. Returns the ib_query_port() error unchanged on failure. */ +static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + static const char *state_name[] = { + [IB_PORT_NOP] = "NOP", + [IB_PORT_DOWN] = "DOWN", + [IB_PORT_INIT] = "INIT", + [IB_PORT_ARMED] = "ARMED", + [IB_PORT_ACTIVE] = "ACTIVE", + [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" + }; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d: %s\n", attr.state, + attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? + state_name[attr.state] : "UNKNOWN"); +} +/* Print the port LID in hex. Returns the ib_query_port() error unchanged on failure. */ +static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.lid); +} +/* Print the port LMC field in decimal. Returns the ib_query_port() error unchanged on failure. */ +static ssize_t lid_mask_count_show(struct ib_port *p, + struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.lmc); +} +/* Print the sm_lid field in hex. Returns the ib_query_port() error unchanged on failure. */ +static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.sm_lid); +} +/* Print the sm_sl field in decimal. Returns the ib_query_port() error unchanged on failure. */ +static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.sm_sl); +} + +static ssize_t cap_mask_show(struct ib_port *p, 
struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%08x\n", attr.port_cap_flags); +} + +static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + char *speed = ""; + int rate; /* in deci-Gb/sec */ + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.active_speed) { + case IB_SPEED_DDR: + speed = " DDR"; + rate = 50; + break; + case IB_SPEED_QDR: + speed = " QDR"; + rate = 100; + break; + case IB_SPEED_FDR10: + speed = " FDR10"; + rate = 100; + break; + case IB_SPEED_FDR: + speed = " FDR"; + rate = 140; + break; + case IB_SPEED_EDR: + speed = " EDR"; + rate = 250; + break; + case IB_SPEED_SDR: + default: /* default to SDR for invalid rates */ + rate = 25; + break; + } + + rate *= ib_width_enum_to_int(attr.active_width); + if (rate < 0) + return -EINVAL; + + return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", + rate / 10, rate % 10 ? 
".5" : "", + ib_width_enum_to_int(attr.active_width), speed); +} + +static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.phys_state) { + case 1: return sprintf(buf, "1: Sleep\n"); + case 2: return sprintf(buf, "2: Polling\n"); + case 3: return sprintf(buf, "3: Disabled\n"); + case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); + case 5: return sprintf(buf, "5: LinkUp\n"); + case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); + case 7: return sprintf(buf, "7: Phy Test\n"); + default: return sprintf(buf, "%d: \n", attr.phys_state); + } +} + +static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return sprintf(buf, "%s\n", "InfiniBand"); + case IB_LINK_LAYER_ETHERNET: + return sprintf(buf, "%s\n", "Ethernet"); + default: + return sprintf(buf, "%s\n", "Unknown"); + } +} + +static PORT_ATTR_RO(state); +static PORT_ATTR_RO(lid); +static PORT_ATTR_RO(lid_mask_count); +static PORT_ATTR_RO(sm_lid); +static PORT_ATTR_RO(sm_sl); +static PORT_ATTR_RO(cap_mask); +static PORT_ATTR_RO(rate); +static PORT_ATTR_RO(phys_state); +static PORT_ATTR_RO(link_layer); + +static struct attribute *port_default_attrs[] = { + &port_attr_state.attr, + &port_attr_lid.attr, + &port_attr_lid_mask_count.attr, + &port_attr_sm_lid.attr, + &port_attr_sm_sl.attr, + &port_attr_cap_mask.attr, + &port_attr_rate.attr, + &port_attr_phys_state.attr, + &port_attr_link_layer.attr, + NULL +}; + +static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", if_name(gid_attr->ndev)); +} + +static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", 
ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + size_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr = {}; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + return ret; +} + +static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL); + if (ret) + return ret; + + return sprintf(buf, GID_PRINT_FMT"\n", GID_PRINT_ARGS(gid.raw)); +} + +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + +static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + u16 pkey; + ssize_t ret; + + ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS , \ +} + +#define 
PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT , \ +} + +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved. + */ +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, + void *data, int offset, size_t size) +{ + struct ib_mad *in_mad; + struct ib_mad *out_mad; + size_t mad_size = sizeof(*out_mad); + u16 out_mad_pkey_index = 0; + ssize_t ret; + + if (!dev->process_mad) + return -ENOSYS; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + ret = -ENOMEM; + goto out; + } + + in_mad->mad_hdr.base_version = 1; + in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; + in_mad->mad_hdr.class_version = 1; + in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; + in_mad->mad_hdr.attr_id = attr; + + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ + + if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, + port_num, NULL, NULL, + (const struct ib_mad_hdr *)in_mad, mad_size, + (struct ib_mad_hdr *)out_mad, &mad_size, + &out_mad_pkey_index) & + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { + ret = -EINVAL; + goto out; + } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + ssize_t ret; + u8 data[8]; + + ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + 40 + offset / 8, sizeof(data)); + if (ret < 
0) + return sprintf(buf, "N/A (no PMA)\n"); + + switch (width) { + case 4: + ret = sprintf(buf, "%u\n", (*data >> + (4 - (offset % 8))) & 0xf); + break; + case 8: + ret = sprintf(buf, "%u\n", *data); + break; + case 16: + ret = sprintf(buf, "%u\n", + be16_to_cpup((__be16 *)data)); + break; + case 32: + ret = sprintf(buf, "%u\n", + be32_to_cpup((__be32 *)data)); + break; + case 64: + ret = sprintf(buf, "%llu\n", + (unsigned long long)be64_to_cpup((__be64 *)data)); + break; + + default: + ret = 0; + } + + return ret; +} + +static PORT_PMA_ATTR(symbol_error , 0, 16, 32); +static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); +static PORT_PMA_ATTR(link_downed , 2, 8, 56); +static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); +static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); +static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); +static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); +static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); +static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); +static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); +static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); +static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); +static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); +static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); +static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); +static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); + +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + 
+static struct attribute *pma_attrs[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_port_xmit_data.attr.attr, + &port_pma_attr_port_rcv_data.attr.attr, + &port_pma_attr_port_xmit_packets.attr.attr, + &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static 
struct attribute *pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static struct attribute_group pma_group = { + .name = "counters", + .attrs = pma_attrs +}; + +static struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + +static void ib_port_release(struct kobject *kobj) +{ + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + struct attribute *a; + int i; + + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + + kfree(p->gid_group.attrs); + } + + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + + kfree(p->pkey_group.attrs); + } + + kfree(p); +} + +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + 
kfree(g->type.attrs); + } + + kfree(g); +} + +static struct kobj_type port_type = { + .release = ib_port_release, + .sysfs_ops = &port_sysfs_ops, + .default_attrs = port_default_attrs +}; + +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct ib_port *, + struct port_attribute *, char *buf), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof(struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + + if (snprintf(element->name, sizeof(element->name), + "%d", i) >= sizeof(element->name)) { + kfree(element); + goto err; + } + + element->attr.attr.name = element->name; + element->attr.attr.mode = S_IRUGO; + element->attr.show = show; + element->index = i; + sysfs_attr_init(&element->attr.attr); + + tab_attr[i] = &element->attr.attr; + } + + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +/* + * Figure out which counter table to use depending on + * the device capabilities. 
+ */ +static struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + +static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + int ret; + + if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) + return 0; + ret = dev->get_hw_stats(dev, stats, port_num, index); + if (ret < 0) + return ret; + if (ret == stats->num_counters) + stats->timestamp = jiffies; + + return 0; +} + +static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)stats->value[index]); +} + +static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ib_device *dev; + struct ib_port *port; + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int ret; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + dev = container_of((struct device *)kobj, + struct ib_device, dev); + stats = dev->hw_stats; + } else { + port = container_of(kobj, struct ib_port, kobj); + dev = port->ibdev; + stats = port->hw_stats; + } + ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + if (ret) + return ret; + return print_hw_stat(stats, hsa->index, buf); +} + +static ssize_t show_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hw_stats_attribute *hsa; + int msecs; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct 
device *)kobj, + struct ib_device, dev); + msecs = jiffies_to_msecs(dev->hw_stats->lifespan); + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + msecs = jiffies_to_msecs(p->hw_stats->lifespan); + } + return sprintf(buf, "%d\n", msecs); +} + +static ssize_t set_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_attribute *hsa; + int msecs; + int jiffies; + int ret; + + ret = kstrtoint(buf, 10, &msecs); + if (ret) + return ret; + if (msecs < 0 || msecs > 10000) + return -EINVAL; + jiffies = msecs_to_jiffies(msecs); + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + dev->hw_stats->lifespan = jiffies; + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + p->hw_stats->lifespan = jiffies; + } + return count; +} + +static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +{ + struct attribute **attr; + + sysfs_remove_group(kobj, attr_group); + + for (attr = attr_group->attrs; *attr; attr++) + kfree(*attr); + kfree(attr_group); +} + +static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = __DECONST(char *, name); + hsa->attr.mode = S_IRUGO; + hsa->show = show_hw_stats; + hsa->store = NULL; + hsa->index = index; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = name; + hsa->attr.mode = S_IWUSR | S_IRUGO; + hsa->show = show_stats_lifespan; + hsa->store = set_stats_lifespan; + hsa->index = 0; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static void 
setup_hw_stats(struct ib_device *device, struct ib_port *port, + u8 port_num) +{ + struct attribute_group *hsag; + struct rdma_hw_stats *stats; + int i, ret; + + stats = device->alloc_hw_stats(device, port_num); + + if (!stats) + return; + + if (!stats->names || stats->num_counters <= 0) + goto err_free_stats; + + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + hsag = kzalloc(sizeof(*hsag) + + sizeof(void *) * (stats->num_counters + 2), + GFP_KERNEL); + if (!hsag) + goto err_free_stats; + + ret = device->get_hw_stats(device, stats, port_num, + stats->num_counters); + if (ret != stats->num_counters) + goto err_free_hsag; + + stats->timestamp = jiffies; + + hsag->name = "hw_counters"; + hsag->attrs = (void *)((char *)hsag + sizeof(*hsag)); + + for (i = 0; i < stats->num_counters; i++) { + hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); + if (!hsag->attrs[i]) + goto err; + sysfs_attr_init(hsag->attrs[i]); + } + + /* treat an error here as non-fatal */ + hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); + if (hsag->attrs[i]) + sysfs_attr_init(hsag->attrs[i]); + + if (port) { + struct kobject *kobj = &port->kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + port->hw_stats_ag = hsag; + port->hw_stats = stats; + } else { + struct kobject *kobj = &device->dev.kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + device->hw_stats_ag = hsag; + device->hw_stats = stats; + } + + return; + +err: + for (; i >= 0; i--) + kfree(hsag->attrs[i]); +err_free_hsag: + kfree(hsag); +err_free_stats: + kfree(stats); + return; +} + +static int add_port(struct ib_device *device, int port_num, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct ib_port *p; + struct ib_port_attr attr; + int i; + int ret; + + ret = ib_query_port(device, port_num, &attr); + if (ret) + return ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if 
(!p) + return -ENOMEM; + + p->ibdev = device; + p->port_num = port_num; + + ret = kobject_init_and_add(&p->kobj, &port_type, + device->ports_parent, + "%d", port_num); + if (ret) { + kfree(p); + return ret; + } + + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; + goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); + goto err_put; + } + + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; + + p->gid_group.name = "gids"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_remove_pma; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if (!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + + p->pkey_group.name = "pkeys"; + p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, + attr.pkey_tbl_len); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_remove_gid_type; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + if (port_callback) { + ret = port_callback(device, 
port_num, &p->kobj); + if (ret) + goto err_remove_pkey; + } + + /* + * If port == 0, it means we have only one port and the parent + * device, not this port device, should be the holder of the + * hw_counters + */ + if (device->alloc_hw_stats && port_num) + setup_hw_stats(device, p, port_num); + + list_add_tail(&p->kobj.entry, &device->port_list); + + return 0; + +err_remove_pkey: + sysfs_remove_group(&p->kobj, &p->pkey_group); + +err_free_pkey: + for (i = 0; i < attr.pkey_tbl_len; ++i) + kfree(p->pkey_group.attrs[i]); + + kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; + +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; + +err_remove_gid: + sysfs_remove_group(&p->kobj, &p->gid_group); + +err_free_gid: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_group.attrs[i]); + + kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; + +err_remove_pma: + sysfs_remove_group(&p->kobj, p->pma_table); + +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); + +err_put: + kobject_put(&p->kobj); + return ret; +} + +static ssize_t show_node_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + switch (dev->node_type) { + case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); + case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); + case 
RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); + case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); + case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); + default: return sprintf(buf, "%d: \n", dev->node_type); + } +} + +static ssize_t show_sys_image_guid(struct device *device, + struct device_attribute *dev_attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); +} + +static ssize_t show_node_guid(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &dev->node_guid)[0]), + be16_to_cpu(((__be16 *) &dev->node_guid)[1]), + be16_to_cpu(((__be16 *) &dev->node_guid)[2]), + be16_to_cpu(((__be16 *) &dev->node_guid)[3])); +} + +static ssize_t show_node_desc(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%.64s\n", dev->node_desc); +} + +static ssize_t set_node_desc(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device_modify desc = {}; + int ret; + + if (!dev->modify_device) + return -EIO; + + memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX)); + ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); + if (ret) + return ret; + + return count; +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ 
+ struct ib_device *dev = container_of(device, struct ib_device, dev); + + ib_get_device_fw_str(dev, buf, PAGE_SIZE); + strlcat(buf, "\n", PAGE_SIZE); + return strlen(buf); +} + +static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); +static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); +static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); +static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); + +static struct device_attribute *ib_class_attributes[] = { + &dev_attr_node_type, + &dev_attr_sys_image_guid, + &dev_attr_node_guid, + &dev_attr_node_desc, + &dev_attr_fw_ver, +}; + +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + if (port->hw_stats) { + kfree(port->hw_stats); + free_hsag(&port->kobj, port->hw_stats_ag); + } + sysfs_remove_group(p, port->pma_table); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct device *class_dev = &device->dev; + int ret; + int i; + + device->dev.parent = device->dma_device; + ret = dev_set_name(class_dev, "%s", device->name); + if (ret) + return ret; + + ret = device_add(class_dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { + ret = device_create_file(class_dev, ib_class_attributes[i]); + if (ret) + goto err_unregister; + } + + device->ports_parent 
= kobject_create_and_add("ports", + &class_dev->kobj); + if (!device->ports_parent) { + ret = -ENOMEM; + goto err_put; + } + + if (rdma_cap_ib_switch(device)) { + ret = add_port(device, 0, port_callback); + if (ret) + goto err_put; + } else { + for (i = 1; i <= device->phys_port_cnt; ++i) { + ret = add_port(device, i, port_callback); + if (ret) + goto err_put; + } + } + + if (device->alloc_hw_stats) + setup_hw_stats(device, NULL, 0); + + return 0; + +err_put: + free_port_list_attributes(device); + +err_unregister: + device_unregister(class_dev); + +err: + return ret; +} + +void ib_device_unregister_sysfs(struct ib_device *device) +{ + int i; + + /* Hold kobject until ib_dealloc_device() */ + kobject_get(&device->dev.kobj); + + free_port_list_attributes(device); + + if (device->hw_stats) { + kfree(device->hw_stats); + free_hsag(&device->dev.kobj, device->hw_stats_ag); + } + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); + + device_unregister(&device->dev); +} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_sysfs.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem.c (revision 319974) @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define LINUXKPI_PARAM_PREFIX ibcore_ + +#include +#include +#include +#include +#include + +#include "uverbs.h" + +#include + +static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) +{ + struct scatterlist *sg; + struct page *page; + int i; + + if (umem->nmap > 0) + ib_dma_unmap_sg(dev, umem->sg_head.sgl, + umem->nmap, + DMA_BIDIRECTIONAL); + + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { + + page = sg_page(sg); + if (umem->writable && dirty) + set_page_dirty_lock(page); + put_page(page); + } + + sg_free_table(&umem->sg_head); + return; + +} + +/** + * ib_umem_get - Pin and DMA map userspace memory. 
+ * + * If access flags indicate ODP memory, avoid pinning. Instead, stores + * the mm for future page fault handling in conjunction with MMU notifiers. + * + * @context: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @dmasync: flush in-flight DMA when the memory region is written + */ +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync) +{ + struct ib_umem *umem; + struct page **page_list; + struct vm_area_struct **vma_list; + unsigned long locked; + unsigned long cur_base; + unsigned long npages; + int ret; + int i; + struct dma_attrs dma_attrs = { 0 }; + struct scatterlist *sg, *sg_list_start; + int need_release = 0; + unsigned int gup_flags = FOLL_WRITE; + + if (dmasync) + dma_attrs.flags |= DMA_ATTR_WRITE_BARRIER; + + if (!size) + return ERR_PTR(-EINVAL); + + /* + * If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) + return ERR_PTR(-EINVAL); + + if (priv_check(curthread, PRIV_VM_MLOCK) != 0) + return ERR_PTR(-EPERM); + + umem = kzalloc(sizeof *umem, GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + umem->context = context; + umem->length = size; + umem->address = addr; + umem->page_size = PAGE_SIZE; + umem->pid = get_pid(task_pid(current)); + /* + * We ask for writable memory if any of the following + * access flags are set. "Local write" and "remote write" + * obviously require write access. "Remote atomic" can do + * things like fetch and add, which will modify memory, and + * "MW bind" can change permissions by binding a window. 
+ */ + umem->writable = !!(access & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); + + if (access & IB_ACCESS_ON_DEMAND) { + ret = ib_umem_odp_get(context, umem); + if (ret) { + kfree(umem); + return ERR_PTR(ret); + } + return umem; + } + + umem->odp_data = NULL; + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) { + kfree(umem); + return ERR_PTR(-ENOMEM); + } + + vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); + + npages = ib_umem_num_pages(umem); + + down_write(&current->mm->mmap_sem); + + locked = npages + current->mm->pinned_vm; + + cur_base = addr & PAGE_MASK; + + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } + + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); + if (ret) + goto out; + + if (!umem->writable) + gup_flags |= FOLL_FORCE; + + need_release = 1; + sg_list_start = umem->sg_head.sgl; + + while (npages) { + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof (struct page *)), + gup_flags, page_list, vma_list); + + if (ret < 0) + goto out; + + umem->npages += ret; + cur_base += ret * PAGE_SIZE; + npages -= ret; + + for_each_sg(sg_list_start, sg, ret, i) { + sg_set_page(sg, page_list[i], PAGE_SIZE, 0); + } + + /* preparing for next loop */ + sg_list_start = sg; + } + + umem->nmap = ib_dma_map_sg_attrs(context->device, + umem->sg_head.sgl, + umem->npages, + DMA_BIDIRECTIONAL, + &dma_attrs); + + if (umem->nmap <= 0) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + +out: + if (ret < 0) { + if (need_release) + __ib_umem_release(context->device, umem, 0); + put_pid(umem->pid); + kfree(umem); + } else + current->mm->pinned_vm = locked; + + up_write(&current->mm->mmap_sem); + if (vma_list) + free_page((unsigned long) vma_list); + free_page((unsigned long) page_list); + + return ret < 0 ? 
ERR_PTR(ret) : umem; +} +EXPORT_SYMBOL(ib_umem_get); + +static void ib_umem_account(struct work_struct *work) +{ + struct ib_umem *umem = container_of(work, struct ib_umem, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->pinned_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +/** + * ib_umem_release - release memory pinned with ib_umem_get + * @umem: umem struct to release + */ +void ib_umem_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + struct mm_struct *mm; + struct task_struct *task; + unsigned long diff; + + if (umem->odp_data) { + ib_umem_odp_release(umem); + return; + } + + __ib_umem_release(umem->context->device, umem, 1); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + diff = ib_umem_num_pages(umem); + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. 
+ */ + if (context->closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_account); + umem->mm = mm; + umem->diff = diff; + + queue_work(ib_wq, &umem->work); + return; + } + } else + down_write(&mm->mmap_sem); + + mm->pinned_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); +out: + kfree(umem); +} +EXPORT_SYMBOL(ib_umem_release); + +int ib_umem_page_count(struct ib_umem *umem) +{ + int shift; + int i; + int n; + struct scatterlist *sg; + + if (umem->odp_data) + return ib_umem_num_pages(umem); + + shift = ilog2(umem->page_size); + + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) + n += sg_dma_len(sg) >> shift; + + return n; +} +EXPORT_SYMBOL(ib_umem_page_count); + +/* + * Copy from the given ib_umem's pages to the given buffer. + * + * umem - the umem to copy from + * offset - offset to start copying from + * dst - destination buffer + * length - buffer length + * + * Returns 0 on success, or an error code. + */ +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) +{ + size_t end = offset + length; + int ret; + + if (offset > umem->length || length > umem->length - offset) { + pr_err("ib_umem_copy_from not in range. 
offset: %zd umem length: %zd end: %zd\n", + offset, umem->length, end); + return -EINVAL; + } + +#ifdef __linux__ + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, + offset + ib_umem_offset(umem)); +#else + ret = 0; +#endif + if (ret < 0) + return ret; + else if (ret != length) + return -EINVAL; + else + return 0; +} +EXPORT_SYMBOL(ib_umem_copy_from); Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_odp.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_odp.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_odp.c (revision 319974) @@ -0,0 +1,667 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include + +static void ib_umem_notifier_start_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + int notifiers_count = item->odp_data->notifiers_count++; + + if (notifiers_count == 0) + /* Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one + * should be waiting right now. */ + reinit_completion(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +static void ib_umem_notifier_end_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + /* + * This sequence increase will notify the QP page fault that + * the page that is going to be mapped in the spte could have + * been freed. 
+ */ + ++item->odp_data->notifiers_seq; + if (--item->odp_data->notifiers_count == 0) + complete_all(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +/* Account for a new mmu notifier in an ib_ucontext. */ +static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +{ + atomic_inc(&context->notifier_count); +} + +/* Account for a terminating mmu notifier in an ib_ucontext. + * + * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since + * the function takes the semaphore itself. */ +static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +{ + int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + + if (zero_notifiers && + !list_empty(&context->no_private_counters)) { + /* No currently running mmu notifiers. Now is the chance to + * add private accounting to all previously added umems. */ + struct ib_umem_odp *odp_data, *next; + + /* Prevent concurrent mmu notifiers from working on the + * no_private_counters list. */ + down_write(&context->umem_rwsem); + + /* Read the notifier_count again, with the umem_rwsem + * semaphore taken for write. */ + if (!atomic_read(&context->notifier_count)) { + list_for_each_entry_safe(odp_data, next, + &context->no_private_counters, + no_private_counters) { + mutex_lock(&odp_data->umem_mutex); + odp_data->mn_counters_active = true; + list_del(&odp_data->no_private_counters); + complete_all(&odp_data->notifier_completion); + mutex_unlock(&odp_data->umem_mutex); + } + } + + up_write(&context->umem_rwsem); + } +} + +static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) { + /* + * Increase the number of notifiers running, to + * prevent any further fault handling on this MR. + */ + ib_umem_notifier_start_account(item); + item->odp_data->dying = 1; + /* Make sure that the fact the umem is dying is out before we release + * all pending page faults. 
*/ + smp_wmb(); + complete_all(&item->odp_data->notifier_completion); + item->context->invalidate_range(item, ib_umem_start(item), + ib_umem_end(item)); + return 0; +} + +static void ib_umem_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ULLONG_MAX, + ib_umem_notifier_release_trampoline, + NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_page_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, start + PAGE_SIZE); + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, address, + address + PAGE_SIZE, + invalidate_page_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, end); + return 0; +} + +static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + 
rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_start_trampoline, NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_end_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static const struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_page = ib_umem_notifier_invalidate_page, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, +}; + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) +{ + int ret_val; + pid_t our_pid; + struct mm_struct *mm = get_task_mm(current); + + if (!mm) + return -EINVAL; + + /* Prevent creating ODP MRs in child processes */ + rcu_read_lock(); + our_pid = get_pid(task_pid_group_leader(current)); + rcu_read_unlock(); + put_pid(our_pid); + if (context->tgid != our_pid) { + ret_val = -EINVAL; + goto out_mm; + } + + umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); + if (!umem->odp_data) { + ret_val = -ENOMEM; + goto out_mm; + } + umem->odp_data->umem = umem; + + mutex_init(&umem->odp_data->umem_mutex); + + init_completion(&umem->odp_data->notifier_completion); + + umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->page_list)); + if (!umem->odp_data->page_list) { + ret_val = -ENOMEM; + goto out_odp_data; + } + + 
umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->dma_list)); + if (!umem->odp_data->dma_list) { + ret_val = -ENOMEM; + goto out_page_list; + } + + /* + * When using MMU notifiers, we will get a + * notification before the "current" task (and MM) is + * destroyed. We use the umem_rwsem semaphore to synchronize. + */ + down_write(&context->umem_rwsem); + context->odp_mrs_count++; + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem->odp_data->interval_tree, + &context->umem_tree); + if (likely(!atomic_read(&context->notifier_count)) || + context->odp_mrs_count == 1) + umem->odp_data->mn_counters_active = true; + else + list_add(&umem->odp_data->no_private_counters, + &context->no_private_counters); + downgrade_write(&context->umem_rwsem); + + if (context->odp_mrs_count == 1) { + /* + * Note that at this point, no MMU notifier is running + * for this context! + */ + atomic_set(&context->notifier_count, 0); + INIT_HLIST_NODE(&context->mn.hlist); + context->mn.ops = &ib_umem_notifiers; + /* + * Lock-dep detects a false positive for mmap_sem vs. + * umem_rwsem, due to not grasping downgrade_write correctly. + */ + ret_val = mmu_notifier_register(&context->mn, mm); + if (ret_val) { + pr_err("Failed to register mmu_notifier %d\n", ret_val); + ret_val = -EBUSY; + goto out_mutex; + } + } + + up_read(&context->umem_rwsem); + + /* + * Note that doing an mmput can cause a notifier for the relevant mm. + * If the notifier is called while we hold the umem_rwsem, this will + * cause a deadlock. Therefore, we release the reference only after we + * released the semaphore. 
+ */ + mmput(mm); + return 0; + +out_mutex: + up_read(&context->umem_rwsem); + vfree(umem->odp_data->dma_list); +out_page_list: + vfree(umem->odp_data->page_list); +out_odp_data: + kfree(umem->odp_data); +out_mm: + mmput(mm); + return ret_val; +} + +void ib_umem_odp_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + + /* + * Ensure that no more pages are mapped in the umem. + * + * It is the driver's responsibility to ensure, before calling us, + * that the hardware will not attempt to access the MR any more. + */ + ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_end(umem)); + + down_write(&context->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem->odp_data->interval_tree, + &context->umem_tree); + context->odp_mrs_count--; + if (!umem->odp_data->mn_counters_active) { + list_del(&umem->odp_data->no_private_counters); + complete_all(&umem->odp_data->notifier_completion); + } + + /* + * Downgrade the lock to a read lock. This ensures that the notifiers + * (who lock the mutex for reading) will be able to finish, and we + * will be able to eventually obtain the mmu notifiers SRCU. Note + * that since we are doing it atomically, no other user could register + * and unregister while we do the check. + */ + downgrade_write(&context->umem_rwsem); + if (!context->odp_mrs_count) { + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(context->tgid, + PIDTYPE_PID); + if (owning_process == NULL) + /* + * The process is already dead, notifier were removed + * already. + */ + goto out; + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) + /* + * The process' mm is already dead, notifier were + * removed already. 
+ */ + goto out_put_task; + mmu_notifier_unregister(&context->mn, owning_mm); + + mmput(owning_mm); + +out_put_task: + put_task_struct(owning_process); + } +out: + up_read(&context->umem_rwsem); + + vfree(umem->odp_data->dma_list); + vfree(umem->odp_data->page_list); + kfree(umem->odp_data); + kfree(umem); +} + +/* + * Map for DMA and insert a single page into the on-demand paging page tables. + * + * @umem: the umem to insert the page to. + * @page_index: index in the umem to add the page to. + * @page: the page struct to map and add. + * @access_mask: access permissions needed for this page. + * @current_seq: sequence number for synchronization with invalidations. + * the sequence number is taken from + * umem->odp_data->notifiers_seq. + * + * The function returns -EFAULT if the DMA mapping operation fails. It returns + * -EAGAIN if a concurrent invalidation prevents us from updating the page. + * + * The page is released via put_page even if the operation failed. For + * on-demand pinning, the page is released whenever it isn't stored in the + * umem. + */ +static int ib_umem_odp_map_dma_single_page( + struct ib_umem *umem, + int page_index, + u64 base_virt_addr, + struct page *page, + u64 access_mask, + unsigned long current_seq) +{ + struct ib_device *dev = umem->context->device; + dma_addr_t dma_addr; + int stored_page = 0; + int remove_existing_mapping = 0; + int ret = 0; + + /* + * Note: we avoid writing if seq is different from the initial seq, to + * handle case of a racing notifier. This check also allows us to bail + * early if we have a notifier running in parallel with us. 
+ */ + if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + ret = -EAGAIN; + goto out; + } + if (!(umem->odp_data->dma_list[page_index])) { + dma_addr = ib_dma_map_page(dev, + page, + 0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, dma_addr)) { + ret = -EFAULT; + goto out; + } + umem->odp_data->dma_list[page_index] = dma_addr | access_mask; + umem->odp_data->page_list[page_index] = page; + stored_page = 1; + } else if (umem->odp_data->page_list[page_index] == page) { + umem->odp_data->dma_list[page_index] |= access_mask; + } else { + pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", + umem->odp_data->page_list[page_index], page); + /* Better remove the mapping now, to prevent any further + * damage. */ + remove_existing_mapping = 1; + } + +out: + /* On Demand Paging - avoid pinning the page */ + if (umem->context->invalidate_range || !stored_page) + put_page(page); + + if (remove_existing_mapping && umem->context->invalidate_range) { + invalidate_page_trampoline( + umem, + base_virt_addr + (page_index * PAGE_SIZE), + base_virt_addr + ((page_index+1)*PAGE_SIZE), + NULL); + ret = -EAGAIN; + } + + return ret; +} + +/** + * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * + * Pins the range of pages passed in the argument, and maps them to + * DMA addresses. The DMA addresses of the mapped pages is updated in + * umem->odp_data->dma_list. + * + * Returns the number of pages mapped in success, negative error code + * for failure. + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents + * the function from completing its task. + * + * @umem: the umem to map and pin + * @user_virt: the address from which we need to map. + * @bcnt: the minimal number of bytes to pin and map. The mapping might be + * bigger due to alignment, and may also be smaller in case of an error + * pinning or mapping a page. 
The actual pages mapped is returned in + * the return value. + * @access_mask: bit mask of the requested access permissions for the given + * range. + * @current_seq: the MMU notifiers sequence value for synchronization with + * invalidations. the sequence number is read from + * umem->odp_data->notifiers_seq before calling this function + */ +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, + u64 access_mask, unsigned long current_seq) +{ + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + struct page **local_page_list = NULL; + u64 off; + int j, k, ret = 0, start_idx, npages = 0; + u64 base_virt_addr; + unsigned int flags = 0; + + if (access_mask == 0) + return -EINVAL; + + if (user_virt < ib_umem_start(umem) || + user_virt + bcnt > ib_umem_end(umem)) + return -EFAULT; + + local_page_list = (struct page **)__get_free_page(GFP_KERNEL); + if (!local_page_list) + return -ENOMEM; + + off = user_virt & (~PAGE_MASK); + user_virt = user_virt & PAGE_MASK; + base_virt_addr = user_virt; + bcnt += off; /* Charge for the first page offset as well. */ + + owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); + if (owning_process == NULL) { + ret = -EINVAL; + goto out_no_task; + } + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) { + ret = -EINVAL; + goto out_put_task; + } + + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; + k = start_idx; + + while (bcnt > 0) { + const size_t gup_num_pages = + min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, + PAGE_SIZE / sizeof(struct page *)); + + down_read(&owning_mm->mmap_sem); + /* + * Note: this might result in redundant page getting. We can + * avoid this by checking dma_list to be 0 before calling + * get_user_pages. However, this makes the code much more + * complex (and doesn't gain us much performance in most use + * cases). 
+ */ + npages = get_user_pages_remote(owning_process, owning_mm, + user_virt, gup_num_pages, + flags, local_page_list, NULL); + up_read(&owning_mm->mmap_sem); + + if (npages < 0) + break; + + bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); + user_virt += npages << PAGE_SHIFT; + mutex_lock(&umem->odp_data->umem_mutex); + for (j = 0; j < npages; ++j) { + ret = ib_umem_odp_map_dma_single_page( + umem, k, base_virt_addr, local_page_list[j], + access_mask, current_seq); + if (ret < 0) + break; + k++; + } + mutex_unlock(&umem->odp_data->umem_mutex); + + if (ret < 0) { + /* Release left over pages when handling errors. */ + for (++j; j < npages; ++j) + put_page(local_page_list[j]); + break; + } + } + + if (ret >= 0) { + if (npages < 0 && k == start_idx) + ret = npages; + else + ret = k - start_idx; + } + + mmput(owning_mm); +out_put_task: + put_task_struct(owning_process); +out_no_task: + free_page((unsigned long)local_page_list); + return ret; +} +EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, + u64 bound) +{ + int idx; + u64 addr; + struct ib_device *dev = umem->context->device; + + virt = max_t(u64, virt, ib_umem_start(umem)); + bound = min_t(u64, bound, ib_umem_end(umem)); + /* Note that during the run of this function, the + * notifiers_count of the MR is > 0, preventing any racing + * faults from completion. We might be racing with other + * invalidations, so we must make sure we free each page only + * once. 
*/ + mutex_lock(&umem->odp_data->umem_mutex); + for (addr = virt; addr < bound; addr += (u64)umem->page_size) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + if (umem->odp_data->page_list[idx]) { + struct page *page = umem->odp_data->page_list[idx]; + dma_addr_t dma = umem->odp_data->dma_list[idx]; + dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; + + WARN_ON(!dma_addr); + + ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma & ODP_WRITE_ALLOWED_BIT) { + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); + } + /* on demand pinning support */ + if (!umem->context->invalidate_range) + put_page(page); + umem->odp_data->page_list[idx] = NULL; + umem->odp_data->dma_list[idx] = 0; + } + } + mutex_unlock(&umem->odp_data->umem_mutex); +} +EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_odp.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c (revision 319974) @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +/* + * The ib_umem list keeps track of memory regions for which the HW + * device request to receive notification when the related memory + * mapping is changed. + * + * ib_umem_lock protects the list. 
+ */ + +static inline u64 node_start(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_start(umem_odp->umem); +} + +/* Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval, while the + * function ib_umem_end returns the first address which is not contained + * in the umem. + */ +static inline u64 node_last(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_end(umem_odp->umem) - 1; +} + +INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, + node_start, node_last, , rbt_ib_umem) + +/* @last is not a part of the interval. See comment for function + * node_last. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, + u64 start, u64 last, + umem_call_back cb, + void *cookie) +{ + int ret_val = 0; + struct umem_odp_node *node; + struct ib_umem_odp *umem; + + if (unlikely(start == last)) + return ret_val; + + for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; + node = rbt_ib_umem_iter_next(node, start, last - 1)) { + umem = container_of(node, struct ib_umem_odp, interval_tree); + ret_val = cb(umem->umem, start, last, cookie) || ret_val; + } + + return ret_val; +} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_verbs.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_verbs.c (nonexistent) +++ 
projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_verbs.c (revision 319974) @@ -0,0 +1,2066 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include "core_priv.h" + +static const char * const ib_events[] = { + [IB_EVENT_CQ_ERR] = "CQ error", + [IB_EVENT_QP_FATAL] = "QP fatal error", + [IB_EVENT_QP_REQ_ERR] = "QP request error", + [IB_EVENT_QP_ACCESS_ERR] = "QP access error", + [IB_EVENT_COMM_EST] = "communication established", + [IB_EVENT_SQ_DRAINED] = "send queue drained", + [IB_EVENT_PATH_MIG] = "path migration successful", + [IB_EVENT_PATH_MIG_ERR] = "path migration error", + [IB_EVENT_DEVICE_FATAL] = "device fatal error", + [IB_EVENT_PORT_ACTIVE] = "port active", + [IB_EVENT_PORT_ERR] = "port error", + [IB_EVENT_LID_CHANGE] = "LID change", + [IB_EVENT_PKEY_CHANGE] = "P_key change", + [IB_EVENT_SM_CHANGE] = "SM change", + [IB_EVENT_SRQ_ERR] = "SRQ error", + [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", + [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", + [IB_EVENT_CLIENT_REREGISTER] = "client reregister", + [IB_EVENT_GID_CHANGE] = "GID changed", +}; + +const char *__attribute_const__ ib_event_msg(enum ib_event_type event) +{ + size_t index = event; + + return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? 
+ ib_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(ib_event_msg); + +static const char * const wc_statuses[] = { + [IB_WC_SUCCESS] = "success", + [IB_WC_LOC_LEN_ERR] = "local length error", + [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", + [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", + [IB_WC_LOC_PROT_ERR] = "local protection error", + [IB_WC_WR_FLUSH_ERR] = "WR flushed", + [IB_WC_MW_BIND_ERR] = "memory management operation error", + [IB_WC_BAD_RESP_ERR] = "bad response error", + [IB_WC_LOC_ACCESS_ERR] = "local access error", + [IB_WC_REM_INV_REQ_ERR] = "invalid request error", + [IB_WC_REM_ACCESS_ERR] = "remote access error", + [IB_WC_REM_OP_ERR] = "remote operation error", + [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", + [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", + [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", + [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", + [IB_WC_REM_ABORT_ERR] = "operation aborted", + [IB_WC_INV_EECN_ERR] = "invalid EE context number", + [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IB_WC_FATAL_ERR] = "fatal error", + [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IB_WC_GENERAL_ERR] = "general error", +}; + +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) +{ + size_t index = status; + + return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? 
+ wc_statuses[index] : "unrecognized status"; +} +EXPORT_SYMBOL(ib_wc_status_msg); + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mult); + +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) +{ + switch (mult) { + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} +EXPORT_SYMBOL(mult_to_ib_rate); + +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 2500; + case IB_RATE_5_GBPS: return 5000; + case IB_RATE_10_GBPS: return 10000; + case IB_RATE_20_GBPS: return 20000; + case IB_RATE_30_GBPS: return 30000; + case IB_RATE_40_GBPS: return 40000; + case IB_RATE_60_GBPS: return 60000; + case IB_RATE_80_GBPS: return 80000; + case IB_RATE_120_GBPS: return 120000; + case IB_RATE_14_GBPS: return 14062; + case IB_RATE_56_GBPS: return 56250; + case IB_RATE_112_GBPS: return 112500; + case IB_RATE_168_GBPS: return 168750; + case IB_RATE_25_GBPS: return 25781; + case IB_RATE_100_GBPS: return 103125; + case IB_RATE_200_GBPS: return 206250; + case IB_RATE_300_GBPS: return 309375; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mbps); + +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type) +{ + switch (node_type) { + case RDMA_NODE_IB_CA: + case RDMA_NODE_IB_SWITCH: + 
case RDMA_NODE_IB_ROUTER: + return RDMA_TRANSPORT_IB; + case RDMA_NODE_RNIC: + return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; + case RDMA_NODE_USNIC_UDP: + return RDMA_TRANSPORT_USNIC_UDP; + default: + BUG(); + return 0; + } +} +EXPORT_SYMBOL(rdma_node_get_transport); + +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +{ + if (device->get_link_layer) + return device->get_link_layer(device, port_num); + + switch (rdma_node_get_transport(device->node_type)) { + case RDMA_TRANSPORT_IB: + return IB_LINK_LAYER_INFINIBAND; + case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: + case RDMA_TRANSPORT_USNIC_UDP: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} +EXPORT_SYMBOL(rdma_port_get_link_layer); + +/* Protection domains */ + +/** + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + * + * Every PD has a local_dma_lkey which can be used as the lkey value for local + * memory operations. 
+ */ +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller) +{ + struct ib_pd *pd; + int mr_access_flags = 0; + + pd = device->alloc_pd(device, NULL, NULL); + if (IS_ERR(pd)) + return pd; + + pd->device = device; + pd->uobject = NULL; + pd->__internal_mr = NULL; + atomic_set(&pd->usecnt, 0); + pd->flags = flags; + + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + pd->local_dma_lkey = device->local_dma_lkey; + else + mr_access_flags |= IB_ACCESS_LOCAL_WRITE; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + pr_warn("%s: enabling unsafe global rkey\n", caller); + mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; + } + + if (mr_access_flags) { + struct ib_mr *mr; + + mr = pd->device->get_dma_mr(pd, mr_access_flags); + if (IS_ERR(mr)) { + ib_dealloc_pd(pd); + return ERR_CAST(mr); + } + + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + mr->need_inval = false; + + pd->__internal_mr = mr; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + pd->local_dma_lkey = pd->__internal_mr->lkey; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) + pd->unsafe_global_rkey = pd->__internal_mr->rkey; + } + + return pd; +} +EXPORT_SYMBOL(__ib_alloc_pd); + +/** + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + * + * It is an error to call this function while any resources in the pd still + * exist. The caller is responsible to synchronously destroy them and + * guarantee no new allocations will happen. + */ +void ib_dealloc_pd(struct ib_pd *pd) +{ + int ret; + + if (pd->__internal_mr) { + ret = pd->device->dereg_mr(pd->__internal_mr); + WARN_ON(ret); + pd->__internal_mr = NULL; + } + + /* uverbs manipulates usecnt with proper locking, while the kabi + requires the caller to guarantee we can't race here. */ + WARN_ON(atomic_read(&pd->usecnt)); + + /* Making delalloc_pd a void return is a WIP, no driver should return + an error here. 
*/ + ret = pd->device->dealloc_pd(pd); + WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); +} +EXPORT_SYMBOL(ib_dealloc_pd); + +/* Address handles */ + +struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct ib_ah *ah; + + ah = pd->device->create_ah(pd, ah_attr); + + if (!IS_ERR(ah)) { + ah->device = pd->device; + ah->pd = pd; + ah->uobject = NULL; + atomic_inc(&pd->usecnt); + } + + return ah; +} +EXPORT_SYMBOL(ib_create_ah); + +static int ib_get_header_version(const union rdma_network_hdr *hdr) +{ + const struct ip *ip4h = (const struct ip *)&hdr->roce4grh; + struct ip ip4h_checked; + const struct ip6_hdr *ip6h = (const struct ip6_hdr *)&hdr->ibgrh; + + /* If it's IPv6, the version must be 6, otherwise, the first + * 20 bytes (before the IPv4 header) are garbled. + */ + if ((ip6h->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) + return (ip4h->ip_v == 4) ? 4 : 0; + /* version may be 6 or 4 because the first 20 bytes could be garbled */ + + /* RoCE v2 requires no options, thus header length + * must be 5 words + */ + if (ip4h->ip_hl != 5) + return 6; + + /* Verify checksum. + * We can't write on scattered buffers so we need to copy to + * temp buffer. 
+ */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.ip_sum = 0; + ip4h_checked.ip_sum = in_cksum_hdr(&ip4h_checked); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->ip_sum == ip4h_checked.ip_sum) + return 4; + return 6; +} + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u8 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_protocol_ib(device, port_num)) + return RDMA_NETWORK_IB; + + grh_version = ib_get_header_version((const union rdma_network_hdr *)grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_ROCE_V1; +} + +struct find_gid_index_context { + u16 vlan_id; + enum ib_gid_type gid_type; +}; + +static bool find_gid_index(const union ib_gid *gid, + const struct ib_gid_attr *gid_attr, + void *context) +{ + struct find_gid_index_context *ctx = + (struct find_gid_index_context *)context; + + if (ctx->gid_type != gid_attr->gid_type) + return false; + + if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || + (is_vlan_dev(gid_attr->ndev) && + vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) + return false; + + return true; +} + +static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, + u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type, + u16 *gid_index) +{ + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; + + return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, + &context, gid_index); +} + +static int get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + memcpy(&src_in.sin_addr.s_addr, + &hdr->roce4grh.ip_src, 4); + 
memcpy(&dst_in.sin_addr.s_addr, + &hdr->roce4grh.ip_dst, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB) { + *dgid = hdr->ibgrh.dgid; + *sgid = hdr->ibgrh.sgid; + return 0; + } else { + return -EINVAL; + } +} + +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct ib_ah_attr *ah_attr) +{ + u32 flow_class; + u16 gid_index; + int ret; + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + int hoplimit = 0xff; + union ib_gid dgid; + union ib_gid sgid; + + memset(ah_attr, 0, sizeof *ah_attr); + if (rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type); + } + ret = get_gids_from_rdma_hdr((const union rdma_network_hdr *)grh, net_type, + &sgid, &dgid); + if (ret) + return ret; + + if (rdma_protocol_roce(device, port_num)) { + int if_index = 0; + u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? + wc->vlan_id : 0xffff; + struct net_device *idev; + struct net_device *resolved_dev; + + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? 
+ NULL : &vlan_id, + &if_index, &hoplimit); + if (ret) { + dev_put(idev); + return ret; + } + + resolved_dev = dev_get_by_index(&init_net, if_index); + if (resolved_dev->if_flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + rcu_read_lock(); + if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, + resolved_dev)) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) + return ret; + + ret = get_sgid_index_from_eth(device, port_num, vlan_id, + &dgid, gid_type, &gid_index); + if (ret) + return ret; + } + + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; + + if (wc->wc_flags & IB_WC_GRH) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = sgid; + + if (!rdma_cap_eth_ah(device, port_num)) { + if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, + port_num, NULL, + &gid_index); + if (ret) + return ret; + } else { + gid_index = 0; + } + } + + ah_attr->grh.sgid_index = (u8) gid_index; + flow_class = be32_to_cpu(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = hoplimit; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + } + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_wc); + +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num) +{ + struct ib_ah_attr ah_attr; + int ret; + + ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + if (ret) + return ERR_PTR(ret); + + return ib_create_ah(pd, &ah_attr); +} +EXPORT_SYMBOL(ib_create_ah_from_wc); + +int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->modify_ah ? 
+ ah->device->modify_ah(ah, ah_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_ah); + +int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->query_ah ? + ah->device->query_ah(ah, ah_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_query_ah); + +int ib_destroy_ah(struct ib_ah *ah) +{ + struct ib_pd *pd; + int ret; + + pd = ah->pd; + ret = ah->device->destroy_ah(ah); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_ah); + +/* Shared receive queues */ + +struct ib_srq *ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr) +{ + struct ib_srq *srq; + + if (!pd->device->create_srq) + return ERR_PTR(-ENOSYS); + + srq = pd->device->create_srq(pd, srq_init_attr, NULL); + + if (!IS_ERR(srq)) { + srq->device = pd->device; + srq->pd = pd; + srq->uobject = NULL; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + atomic_inc(&srq->ext.xrc.cq->usecnt); + } + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + } + + return srq; +} +EXPORT_SYMBOL(ib_create_srq); + +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask) +{ + return srq->device->modify_srq ? + srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_srq); + +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr) +{ + return srq->device->query_srq ? 
+ srq->device->query_srq(srq, srq_attr) : -ENOSYS; +} +EXPORT_SYMBOL(ib_query_srq); + +int ib_destroy_srq(struct ib_srq *srq) +{ + struct ib_pd *pd; + enum ib_srq_type srq_type; + struct ib_xrcd *uninitialized_var(xrcd); + struct ib_cq *uninitialized_var(cq); + int ret; + + if (atomic_read(&srq->usecnt)) + return -EBUSY; + + pd = srq->pd; + srq_type = srq->srq_type; + if (srq_type == IB_SRQT_XRC) { + xrcd = srq->ext.xrc.xrcd; + cq = srq->ext.xrc.cq; + } + + ret = srq->device->destroy_srq(srq); + if (!ret) { + atomic_dec(&pd->usecnt); + if (srq_type == IB_SRQT_XRC) { + atomic_dec(&xrcd->usecnt); + atomic_dec(&cq->usecnt); + } + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_srq); + +/* Queue pairs */ + +static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = context; + unsigned long flags; + + spin_lock_irqsave(&qp->device->event_handler_lock, flags); + list_for_each_entry(event->element.qp, &qp->open_list, open_list) + if (event->element.qp->event_handler) + event->element.qp->event_handler(event, event->element.qp->qp_context); + spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); +} + +static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) +{ + mutex_lock(&xrcd->tgt_qp_mutex); + list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); + mutex_unlock(&xrcd->tgt_qp_mutex); +} + +static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, + void (*event_handler)(struct ib_event *, void *), + void *qp_context) +{ + struct ib_qp *qp; + unsigned long flags; + + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->real_qp = real_qp; + atomic_inc(&real_qp->usecnt); + qp->device = real_qp->device; + qp->event_handler = event_handler; + qp->qp_context = qp_context; + qp->qp_num = real_qp->qp_num; + qp->qp_type = real_qp->qp_type; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_add(&qp->open_list, &real_qp->open_list); + 
spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + return qp; +} + +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr) +{ + struct ib_qp *qp, *real_qp; + + if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) + return ERR_PTR(-EINVAL); + + qp = ERR_PTR(-EINVAL); + mutex_lock(&xrcd->tgt_qp_mutex); + list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { + if (real_qp->qp_num == qp_open_attr->qp_num) { + qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, + qp_open_attr->qp_context); + break; + } + } + mutex_unlock(&xrcd->tgt_qp_mutex); + return qp; +} +EXPORT_SYMBOL(ib_open_qp); + +static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *real_qp = qp; + + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + return qp; +} + +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_device *device = pd ? 
pd->device : qp_init_attr->xrcd->device; + struct ib_qp *qp; + + if (qp_init_attr->rwq_ind_tbl && + (qp_init_attr->recv_cq || + qp_init_attr->srq || qp_init_attr->cap.max_recv_wr || + qp_init_attr->cap.max_recv_sge)) + return ERR_PTR(-EINVAL); + + qp = device->create_qp(pd, qp_init_attr, NULL); + if (IS_ERR(qp)) + return qp; + + qp->device = device; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; + qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; + + atomic_set(&qp->usecnt, 0); + spin_lock_init(&qp->mr_lock); + + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) + return ib_create_xrc_qp(qp, qp_init_attr); + + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + if (qp_init_attr->recv_cq) + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } + + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; + + atomic_inc(&pd->usecnt); + if (qp_init_attr->send_cq) + atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->rwq_ind_tbl) + atomic_inc(&qp->rwq_ind_tbl->usecnt); + + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd. 
+ */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + + return qp; +} +EXPORT_SYMBOL(ib_create_qp); + +static const struct { + int valid; + enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_PORT, + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + [IB_QPS_RTR] = { + .valid = 1, + .req_param = { + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_RC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + 
[IB_QPT_XRC_TGT] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_RC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + }, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = IB_QP_SQ_PSN, + [IB_QPT_UC] = IB_QP_SQ_PSN, + [IB_QPT_RC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | + IB_QP_SQ_PSN), + [IB_QPT_SMI] = IB_QP_SQ_PSN, + [IB_QPT_GSI] = IB_QP_SQ_PSN, + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + 
[IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ + [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY + } + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + 
[IB_QPT_RC] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 } + } +}; + +int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (cur_state < 0 || cur_state > IB_QPS_ERR || + next_state < 0 || next_state > IB_QPS_ERR) + return 0; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return 0; + + if (!qp_state_table[cur_state][next_state].valid) + return 0; + + req_param = qp_state_table[cur_state][next_state].req_param[type]; + opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + + if ((mask & req_param) != req_param) + return 0; + 
+ if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return 0; + + return 1; +} +EXPORT_SYMBOL(ib_modify_qp_is_ok); + +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + int ret = 0; + + if (*qp_attr_mask & IB_QP_AV) { + if (qp_attr->ah_attr.port_num < rdma_start_port(qp->device) || + qp_attr->ah_attr.port_num > rdma_end_port(qp->device)) + return -EINVAL; + + if (!rdma_cap_eth_ah(qp->device, qp_attr->ah_attr.port_num)) + return 0; + + if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { + rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, + qp_attr->ah_attr.dmac); + } else { + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + int ifindex; + int hop_limit; + + ret = ib_query_gid(qp->device, + qp_attr->ah_attr.port_num, + qp_attr->ah_attr.grh.sgid_index, + &sgid, &sgid_attr); + + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; + goto out; + } + + ifindex = sgid_attr.ndev->if_index; + + ret = rdma_addr_find_l2_eth_by_grh(&sgid, + &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, + NULL, &ifindex, &hop_limit); + + dev_put(sgid_attr.ndev); + + qp_attr->ah_attr.grh.hop_limit = hop_limit; + } + } +out: + return ret; +} +EXPORT_SYMBOL(ib_resolve_eth_dmac); + + +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + int ret; + + ret = ib_resolve_eth_dmac(qp, qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); +} +EXPORT_SYMBOL(ib_modify_qp); + +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + return qp->device->query_qp ? 
+ qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_query_qp); + +int ib_close_qp(struct ib_qp *qp) +{ + struct ib_qp *real_qp; + unsigned long flags; + + real_qp = qp->real_qp; + if (real_qp == qp) + return -EINVAL; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_del(&qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + atomic_dec(&real_qp->usecnt); + kfree(qp); + + return 0; +} +EXPORT_SYMBOL(ib_close_qp); + +static int __ib_destroy_shared_qp(struct ib_qp *qp) +{ + struct ib_xrcd *xrcd; + struct ib_qp *real_qp; + int ret; + + real_qp = qp->real_qp; + xrcd = real_qp->xrcd; + + mutex_lock(&xrcd->tgt_qp_mutex); + ib_close_qp(qp); + if (atomic_read(&real_qp->usecnt) == 0) + list_del(&real_qp->xrcd_list); + else + real_qp = NULL; + mutex_unlock(&xrcd->tgt_qp_mutex); + + if (real_qp) { + ret = ib_destroy_qp(real_qp); + if (!ret) + atomic_dec(&xrcd->usecnt); + else + __ib_insert_xrcd_qp(xrcd, real_qp); + } + + return 0; +} + +int ib_destroy_qp(struct ib_qp *qp) +{ + struct ib_pd *pd; + struct ib_cq *scq, *rcq; + struct ib_srq *srq; + struct ib_rwq_ind_table *ind_tbl; + int ret; + + if (atomic_read(&qp->usecnt)) + return -EBUSY; + + if (qp->real_qp != qp) + return __ib_destroy_shared_qp(qp); + + pd = qp->pd; + scq = qp->send_cq; + rcq = qp->recv_cq; + srq = qp->srq; + ind_tbl = qp->rwq_ind_tbl; + + ret = qp->device->destroy_qp(qp); + if (!ret) { + if (pd) + atomic_dec(&pd->usecnt); + if (scq) + atomic_dec(&scq->usecnt); + if (rcq) + atomic_dec(&rcq->usecnt); + if (srq) + atomic_dec(&srq->usecnt); + if (ind_tbl) + atomic_dec(&ind_tbl->usecnt); + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_qp); + +/* Completion queues */ + +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, + const struct ib_cq_init_attr *cq_attr) +{ + struct ib_cq *cq; + + cq = 
device->create_cq(device, cq_attr, NULL, NULL); + + if (!IS_ERR(cq)) { + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + } + + return cq; +} +EXPORT_SYMBOL(ib_create_cq); + +int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + return cq->device->modify_cq ? + cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_cq); + +int ib_destroy_cq(struct ib_cq *cq) +{ + if (atomic_read(&cq->usecnt)) + return -EBUSY; + + return cq->device->destroy_cq(cq); +} +EXPORT_SYMBOL(ib_destroy_cq); + +int ib_resize_cq(struct ib_cq *cq, int cqe) +{ + return cq->device->resize_cq ? + cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS; +} +EXPORT_SYMBOL(ib_resize_cq); + +/* Memory regions */ + +int ib_dereg_mr(struct ib_mr *mr) +{ + struct ib_pd *pd = mr->pd; + int ret; + + ret = mr->device->dereg_mr(mr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dereg_mr); + +/** + * ib_alloc_mr() - Allocates a memory region + * @pd: protection domain associated with the region + * @mr_type: memory region type + * @max_num_sg: maximum sg entries available for registration. + * + * Notes: + * Memory registration page/sg lists must not exceed max_num_sg. + * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed + * max_num_sg * used_page_size. 
+ * + */ +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct ib_mr *mr; + + if (!pd->device->alloc_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->alloc_mr(pd, mr_type, max_num_sg); + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + } + + return mr; +} +EXPORT_SYMBOL(ib_alloc_mr); + +/* "Fast" memory regions */ + +struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ib_fmr *fmr; + + if (!pd->device->alloc_fmr) + return ERR_PTR(-ENOSYS); + + fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr); + if (!IS_ERR(fmr)) { + fmr->device = pd->device; + fmr->pd = pd; + atomic_inc(&pd->usecnt); + } + + return fmr; +} +EXPORT_SYMBOL(ib_alloc_fmr); + +int ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *fmr; + + if (list_empty(fmr_list)) + return 0; + + fmr = list_entry(fmr_list->next, struct ib_fmr, list); + return fmr->device->unmap_fmr(fmr_list); +} +EXPORT_SYMBOL(ib_unmap_fmr); + +int ib_dealloc_fmr(struct ib_fmr *fmr) +{ + struct ib_pd *pd; + int ret; + + pd = fmr->pd; + ret = fmr->device->dealloc_fmr(fmr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dealloc_fmr); + +/* Multicast groups */ + +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->attach_mcast) + return -ENOSYS; + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + + ret = qp->device->attach_mcast(qp, gid, lid); + if (!ret) + atomic_inc(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_attach_mcast); + +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->detach_mcast) + return -ENOSYS; + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + + ret = qp->device->detach_mcast(qp, gid, lid); + if (!ret) + atomic_dec(&qp->usecnt); 
+ return ret; +} +EXPORT_SYMBOL(ib_detach_mcast); + +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +{ + struct ib_xrcd *xrcd; + + if (!device->alloc_xrcd) + return ERR_PTR(-ENOSYS); + + xrcd = device->alloc_xrcd(device, NULL, NULL); + if (!IS_ERR(xrcd)) { + xrcd->device = device; + xrcd->inode = NULL; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + } + + return xrcd; +} +EXPORT_SYMBOL(ib_alloc_xrcd); + +int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct ib_qp *qp; + int ret; + + if (atomic_read(&xrcd->usecnt)) + return -EBUSY; + + while (!list_empty(&xrcd->tgt_qp_list)) { + qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); + ret = ib_destroy_qp(qp); + if (ret) + return ret; + } + + return xrcd->device->dealloc_xrcd(xrcd); +} +EXPORT_SYMBOL(ib_dealloc_xrcd); + +/** + * ib_create_wq - Creates a WQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the WQ. + * @wq_init_attr: A list of initial attributes required to create the + * WQ. If WQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created WQ. + * + * wq_init_attr->max_wr and wq_init_attr->max_sge determine + * the requested size of the WQ, and set to the actual values allocated + * on return. + * If ib_create_wq() succeeds, then max_wr and max_sge will always be + * at least as large as the requested values. 
+ */ +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *wq_attr) +{ + struct ib_wq *wq; + + if (!pd->device->create_wq) + return ERR_PTR(-ENOSYS); + + wq = pd->device->create_wq(pd, wq_attr, NULL); + if (!IS_ERR(wq)) { + wq->event_handler = wq_attr->event_handler; + wq->wq_context = wq_attr->wq_context; + wq->wq_type = wq_attr->wq_type; + wq->cq = wq_attr->cq; + wq->device = pd->device; + wq->pd = pd; + wq->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&wq_attr->cq->usecnt); + atomic_set(&wq->usecnt, 0); + } + return wq; +} +EXPORT_SYMBOL(ib_create_wq); + +/** + * ib_destroy_wq - Destroys the specified WQ. + * @wq: The WQ to destroy. + */ +int ib_destroy_wq(struct ib_wq *wq) +{ + int err; + struct ib_cq *cq = wq->cq; + struct ib_pd *pd = wq->pd; + + if (atomic_read(&wq->usecnt)) + return -EBUSY; + + err = wq->device->destroy_wq(wq); + if (!err) { + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + } + return err; +} +EXPORT_SYMBOL(ib_destroy_wq); + +/** + * ib_modify_wq - Modifies the specified WQ. + * @wq: The WQ to modify. + * @wq_attr: On input, specifies the WQ attributes to modify. + * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ + * are being modified. + * On output, the current values of selected WQ attributes are returned. + */ +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask) +{ + int err; + + if (!wq->device->modify_wq) + return -ENOSYS; + + err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL); + return err; +} +EXPORT_SYMBOL(ib_modify_wq); + +/* + * ib_create_rwq_ind_table - Creates a RQ Indirection Table. + * @device: The device on which to create the rwq indirection table. + * @ib_rwq_ind_table_init_attr: A list of initial attributes required to + * create the Indirection Table. 
+ * + * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less + * than the created ib_rwq_ind_table object and the caller is responsible + * for its memory allocation/free. + */ +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr) +{ + struct ib_rwq_ind_table *rwq_ind_table; + int i; + u32 table_size; + + if (!device->create_rwq_ind_table) + return ERR_PTR(-ENOSYS); + + table_size = (1 << init_attr->log_ind_tbl_size); + rwq_ind_table = device->create_rwq_ind_table(device, + init_attr, NULL); + if (IS_ERR(rwq_ind_table)) + return rwq_ind_table; + + rwq_ind_table->ind_tbl = init_attr->ind_tbl; + rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; + rwq_ind_table->device = device; + rwq_ind_table->uobject = NULL; + atomic_set(&rwq_ind_table->usecnt, 0); + + for (i = 0; i < table_size; i++) + atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); + + return rwq_ind_table; +} +EXPORT_SYMBOL(ib_create_rwq_ind_table); + +/* + * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. + * @wq_ind_table: The Indirection Table to destroy. 
+*/ +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) +{ + int err, i; + u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); + struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; + + if (atomic_read(&rwq_ind_table->usecnt)) + return -EBUSY; + + err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table); + if (!err) { + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); + } + + return err; +} +EXPORT_SYMBOL(ib_destroy_rwq_ind_table); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct ib_flow *flow_id; + if (!qp->device->create_flow) + return ERR_PTR(-ENOSYS); + + flow_id = qp->device->create_flow(qp, flow_attr, domain); + if (!IS_ERR(flow_id)) + atomic_inc(&qp->usecnt); + return flow_id; +} +EXPORT_SYMBOL(ib_create_flow); + +int ib_destroy_flow(struct ib_flow *flow_id) +{ + int err; + struct ib_qp *qp = flow_id->qp; + + err = qp->device->destroy_flow(flow_id); + if (!err) + atomic_dec(&qp->usecnt); + return err; +} +EXPORT_SYMBOL(ib_destroy_flow); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + return mr->device->check_mr_status ? 
+ mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; +} +EXPORT_SYMBOL(ib_check_mr_status); + +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state) +{ + if (!device->set_vf_link_state) + return -ENOSYS; + + return device->set_vf_link_state(device, vf, port, state); +} +EXPORT_SYMBOL(ib_set_vf_link_state); + +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info) +{ + if (!device->get_vf_config) + return -ENOSYS; + + return device->get_vf_config(device, vf, port, info); +} +EXPORT_SYMBOL(ib_get_vf_config); + +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats) +{ + if (!device->get_vf_stats) + return -ENOSYS; + + return device->get_vf_stats(device, vf, port, stats); +} +EXPORT_SYMBOL(ib_get_vf_stats); + +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type) +{ + if (!device->set_vf_guid) + return -ENOSYS; + + return device->set_vf_guid(device, vf, port, guid, type); +} +EXPORT_SYMBOL(ib_set_vf_guid); + +/** + * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list + * and set it to the memory region. + * @mr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg + * @page_size: page vector desired page size + * + * Constraints: + * - The first sg element is allowed to have an offset. + * - Each sg element must either be aligned to page_size or virtually + * contiguous to the previous element. In case an sg element has a + * non-contiguous offset, the mapping prefix will not include it. + * - The last sg element is allowed to have length less than page_size. + * - If sg_nents total byte length exceeds the mr max_num_sg * page_size + * then only max_num_sg entries will be mapped. + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these + * constraints holds and the page_size argument is ignored. 
+ * + * Returns the number of sg elements that were mapped to the memory region. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->map_mr_sg)) + return -ENOSYS; + + mr->page_size = page_size; + + return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg); + +/** + * ib_sg_to_pages() - Convert the largest prefix of a sg list + * to a page vector + * @mr: memory region + * @sgl: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset_p: IN: start offset in bytes into sg + * OUT: offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. + * @set_page: driver page assignment function pointer + * + * Core service helper for drivers to convert the largest + * prefix of given sg list to a page vector. The sg list + * prefix converted is the prefix that meet the requirements + * of ib_map_mr_sg. + * + * Returns the number of sg elements that were assigned to + * a page vector. + */ +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) +{ + struct scatterlist *sg; + u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? 
*sg_offset_p : 0; + unsigned int last_page_off = 0; + u64 page_mask = ~((u64)mr->page_size - 1); + int i, ret; + + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; + mr->length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; + u64 end_dma_addr = dma_addr + dma_len; + u64 page_addr = dma_addr & page_mask; + + /* + * For the second and later elements, check whether either the + * end of element i-1 or the start of element i is not aligned + * on a page boundary. + */ + if (i && (last_page_off != 0 || page_addr != dma_addr)) { + /* Stop mapping if there is a gap. */ + if (last_end_dma_addr != dma_addr) + break; + + /* + * Coalesce this element with the last. If it is small + * enough just update mr->length. Otherwise start + * mapping from the next page. + */ + goto next_page; + } + + do { + ret = set_page(mr, page_addr); + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? i : ret; + } + prev_addr = page_addr; +next_page: + page_addr += mr->page_size; + } while (page_addr < end_dma_addr); + + mr->length += dma_len; + last_end_dma_addr = end_dma_addr; + last_page_off = end_dma_addr & ~page_mask; + + sg_offset = 0; + } + + if (sg_offset_p) + *sg_offset_p = 0; + return i; +} +EXPORT_SYMBOL(ib_sg_to_pages); + +struct ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* + * Post a WR and block until its completion is reaped for the SQ. 
+ */ +static void __ib_drain_sq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe sdrain; + struct ib_send_wr swr = {}, *bad_swr; + int ret; + + if (qp->send_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->send_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + swr.wr_cqe = &sdrain.cqe; + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + ret = ib_post_send(qp, &swr, &bad_swr); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + wait_for_completion(&sdrain.done); +} + +/* + * Post a WR and block until its completion is reaped for the RQ. + */ +static void __ib_drain_rq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}, *bad_rwr; + int ret; + + if (qp->recv_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->recv_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + ret = ib_post_recv(qp, &rwr, &bad_rwr); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + wait_for_completion(&rdrain.done); +} + +/** + * ib_drain_sq() - Block until all SQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_sq(). + * + * The caller must: + * + * ensure there is room in the CQ and SQ for the drain work request and + * completion. 
+ * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_sq(struct ib_qp *qp) +{ + if (qp->device->drain_sq) + qp->device->drain_sq(qp); + else + __ib_drain_sq(qp); +} +EXPORT_SYMBOL(ib_drain_sq); + +/** + * ib_drain_rq() - Block until all RQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_rq(). + * + * The caller must: + * + * ensure there is room in the CQ and RQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_rq(struct ib_qp *qp) +{ + if (qp->device->drain_rq) + qp->device->drain_rq(qp); + else + __ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_rq); + +/** + * ib_drain_qp() - Block until all CQEs have been consumed by the + * application on both the RQ and SQ. + * @qp: queue pair to drain + * + * The caller must: + * + * ensure there is room in the CQ(s), SQ, and RQ for drain work requests + * and completions. + * + * allocate the CQs using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. 
+ */ +void ib_drain_qp(struct ib_qp *qp) +{ + ib_drain_sq(qp); + if (!qp->srq) + ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_qp); Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ib_verbs.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwcm.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwcm.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwcm.c (revision 319974) @@ -1,1285 +1,1050 @@ /* * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. 
* * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ -#include "opt_inet.h" - #include #include #include #include #include #include #include #include #include #include #include -#include -#include -#include -#include #include #include +#include #include "iwcm.h" MODULE_AUTHOR("Tom Tucker"); MODULE_DESCRIPTION("iWARP CM"); MODULE_LICENSE("Dual BSD/GPL"); static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; struct list_head list; struct iw_cm_event event; struct list_head free_list; }; -struct iwcm_listen_work { - struct work_struct work; - struct iw_cm_id *cm_id; -}; -static LIST_HEAD(listen_port_list); +static unsigned int default_backlog = 256; -static DEFINE_MUTEX(listen_port_mutex); - -struct listen_port_info { - struct list_head list; - uint16_t port_num; - uint32_t refcnt; -}; - -static int32_t -add_port_to_listenlist(uint16_t port) -{ - struct listen_port_info *port_info; - int err = 0; - - mutex_lock(&listen_port_mutex); - - list_for_each_entry(port_info, &listen_port_list, list) - if (port_info->port_num == port) - goto found_port; - - port_info = kmalloc(sizeof(*port_info), GFP_KERNEL); - if (!port_info) { - err = -ENOMEM; - mutex_unlock(&listen_port_mutex); - goto out; - } - - port_info->port_num = port; - port_info->refcnt = 0; - - list_add(&port_info->list, 
&listen_port_list); - -found_port: - ++(port_info->refcnt); - mutex_unlock(&listen_port_mutex); - return port_info->refcnt; -out: - return err; -} - -static int32_t -rem_port_from_listenlist(uint16_t port) -{ - struct listen_port_info *port_info; - int ret, found_port = 0; - - mutex_lock(&listen_port_mutex); - - list_for_each_entry(port_info, &listen_port_list, list) - if (port_info->port_num == port) { - found_port = 1; - break; - } - - if (found_port) { - --(port_info->refcnt); - ret = port_info->refcnt; - if (port_info->refcnt == 0) { - /* Remove this entry from the list as there are no - * more listeners for this port_num. - */ - list_del(&port_info->list); - kfree(port_info); - } - } else { - ret = -EINVAL; - } - mutex_unlock(&listen_port_mutex); - return ret; - -} - /* * The following services provide a mechanism for pre-allocating iwcm_work * elements. The design pre-allocates them based on the cm_id type: * LISTENING IDS: Get enough elements preallocated to handle the * listen backlog. * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE * * Allocating them in connect and listen avoids having to deal * with allocation failures on the event upcall from the provider (which * is called in the interrupt context). * * One exception is when creating the cm_id for incoming connection requests. * There are two cases: * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If * the backlog is exceeded, then no more connection request events will * be processed. cm_event_handler() returns -ENOMEM in this case. Its up * to the provider to reject the connection request. * 2) in the connection request workqueue handler, cm_conn_req_handler(). * If work elements cannot be allocated for the new connect request cm_id, * then IWCM will call the provider reject method. This is ok since * cm_conn_req_handler() runs in the workqueue thread context. 
*/ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) { struct iwcm_work *work; if (list_empty(&cm_id_priv->work_free_list)) return NULL; work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, free_list); list_del_init(&work->free_list); return work; } static void put_work(struct iwcm_work *work) { list_add(&work->free_list, &work->cm_id->work_free_list); } static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) { struct list_head *e, *tmp; list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) kfree(list_entry(e, struct iwcm_work, free_list)); } static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) { struct iwcm_work *work; BUG_ON(!list_empty(&cm_id_priv->work_free_list)); while (count--) { work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); if (!work) { dealloc_work_entries(cm_id_priv); return -ENOMEM; } work->cm_id = cm_id_priv; INIT_LIST_HEAD(&work->list); put_work(work); } return 0; } /* * Save private data from incoming connection requests to * iw_cm_event, so the low level driver doesn't have to. Adjust * the event ptr to point to the local copy. */ static int copy_private_data(struct iw_cm_event *event) { void *p; p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); if (!p) return -ENOMEM; event->private_data = p; return 0; } static void free_cm_id(struct iwcm_id_private *cm_id_priv) { dealloc_work_entries(cm_id_priv); kfree(cm_id_priv); } /* * Release a reference on cm_id. If the last reference is being - * released, enable the waiting thread (in iw_destroy_cm_id) to - * get woken up, and return 1 if a thread is already waiting. + * released, free the cm_id and return 1. 
*/ static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { BUG_ON(atomic_read(&cm_id_priv->refcount)==0); if (atomic_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); - complete(&cm_id_priv->destroy_comp); + free_cm_id(cm_id_priv); return 1; } return 0; } static void add_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); atomic_inc(&cm_id_priv->refcount); } static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; - int cb_destroy; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - /* - * Test bit before deref in case the cm_id gets freed on another - * thread. - */ - cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - if (iwcm_deref_id(cm_id_priv) && cb_destroy) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); - free_cm_id(cm_id_priv); - } + (void)iwcm_deref_id(cm_id_priv); } static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id *iw_create_cm_id(struct ib_device *device, - struct socket *so, iw_cm_handler cm_handler, void *context) { struct iwcm_id_private *cm_id_priv; cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->state = IW_CM_STATE_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.event_handler = cm_event_handler; cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; - cm_id_priv->id.so = so; spin_lock_init(&cm_id_priv->lock); atomic_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; } EXPORT_SYMBOL(iw_create_cm_id); static int iwcm_modify_qp_err(struct ib_qp *qp) { struct ib_qp_attr qp_attr; if (!qp) return -EINVAL; 
qp_attr.qp_state = IB_QPS_ERR; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * This is really the RDMAC CLOSING state. It is most similar to the * IB SQD QP state. */ static int iwcm_modify_qp_sqd(struct ib_qp *qp) { struct ib_qp_attr qp_attr; BUG_ON(qp == NULL); qp_attr.qp_state = IB_QPS_SQD; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * CM_ID <-- CLOSING * * Block if a passive or active connection is currently being processed. Then * process the event as follows: * - If we are ESTABLISHED, move to CLOSING and modify the QP state * based on the abrupt flag * - If the connection is already in the CLOSING or IDLE state, the peer is * disconnecting concurrently with us and we've already seen the * DISCONNECT event -- ignore the request and return 0 * - Disconnect on a listening endpoint returns -EINVAL */ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; struct ib_qp *qp = NULL; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* Wait if we're currently in a connect or accept downcall */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_CLOSING; /* QP could be for user-mode client */ if (cm_id_priv->qp) qp = cm_id_priv->qp; else ret = -EINVAL; break; case IW_CM_STATE_LISTEN: ret = -EINVAL; break; case IW_CM_STATE_CLOSING: /* remote peer closed first */ case IW_CM_STATE_IDLE: /* accept or connect returned !0 */ break; case IW_CM_STATE_CONN_RECV: /* * App called disconnect before/without calling accept after * connect_request event delivered. 
*/ break; case IW_CM_STATE_CONN_SENT: /* Can only get here if wait above fails */ default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) { if (abrupt) ret = iwcm_modify_qp_err(qp); else ret = iwcm_modify_qp_sqd(qp); /* * If both sides are disconnecting the QP could * already be in ERR or SQD states */ ret = 0; } return ret; } EXPORT_SYMBOL(iw_cm_disconnect); -static struct socket * -dequeue_socket(struct socket *head) -{ - struct socket *so; - struct sockaddr_in *remote; - int error; - - SOLISTEN_LOCK(head); - error = solisten_dequeue(head, &so, SOCK_NONBLOCK); - if (error == EWOULDBLOCK) - return (NULL); - remote = NULL; - soaccept(so, (struct sockaddr **)&remote); - - free(remote, M_SONAME); - return so; -} - -static void -iw_so_event_handler(struct work_struct *_work) -{ -#ifdef INET - struct iwcm_listen_work *work = container_of(_work, - struct iwcm_listen_work, work); - struct iw_cm_id *listen_cm_id = work->cm_id; - struct iwcm_id_private *cm_id_priv; - struct iw_cm_id *real_cm_id; - struct sockaddr_in *local; - struct socket *so; - - cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id); - - if (cm_id_priv->state != IW_CM_STATE_LISTEN) { - kfree(work); - return; - } - - /* Dequeue & process all new 'so' connection requests for this cmid */ - while ((so = dequeue_socket(work->cm_id->so)) != NULL) { - if (rdma_cma_any_addr((struct sockaddr *) - &listen_cm_id->local_addr)) { - in_getsockaddr(so, (struct sockaddr **)&local); - if (rdma_find_cmid_laddr(local, ARPHRD_ETHER, - (void **) &real_cm_id)) { - free(local, M_SONAME); - goto err; - } - free(local, M_SONAME); - - real_cm_id->device->iwcm->newconn(real_cm_id, so); - } else { - listen_cm_id->device->iwcm->newconn(listen_cm_id, so); - } - } -err: - kfree(work); -#endif - return; -} - -static int -iw_so_upcall(struct socket *parent_so, void *arg, int waitflag) -{ - struct iwcm_listen_work *work; - struct iw_cm_id *cm_id = arg; - - /* check whether iw_so_event_handler() already 
dequeued this 'so' */ - if (TAILQ_EMPTY(&parent_so->sol_comp)) - return SU_OK; - work = kzalloc(sizeof(*work), waitflag); - if (!work) - return -ENOMEM; - work->cm_id = cm_id; - - INIT_WORK(&work->work, iw_so_event_handler); - queue_work(iwcm_wq, &work->work); - - return SU_OK; -} - -static int -iw_create_listen(struct iw_cm_id *cm_id, int backlog) -{ - struct sockopt sopt; - struct socket *so = cm_id->so; - int on = 1; - int rc; - - rc = -solisten(cm_id->so, backlog, curthread); - if (rc != 0) - return (rc); - SOLISTEN_LOCK(so); - solisten_upcall_set(so, iw_so_upcall, cm_id); - so->so_state |= SS_NBIO; - SOLISTEN_UNLOCK(so); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = IPPROTO_TCP; - sopt.sopt_name = TCP_NODELAY; - sopt.sopt_val = (caddr_t)&on; - sopt.sopt_valsize = sizeof(on); - sopt.sopt_td = NULL; - sosetopt(so, &sopt); - return (0); -} - -static int -iw_destroy_listen(struct iw_cm_id *cm_id) -{ - struct socket *so = cm_id->so; - - SOLISTEN_LOCK(so); - solisten_upcall_set(so, NULL, NULL); - SOLISTEN_UNLOCK(so); - return (0); -} - - /* * CM_ID <-- DESTROYING * * Clean up all resources associated with the connection and release * the initial reference taken by iw_create_cm_id. */ static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret = 0, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* * Wait if we're currently in a connect or accept downcall. A * listening endpoint should never block here. */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + /* + * Since we're deleting the cm_id, drop any events that + * might arrive before the last dereference. 
+ */ + set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags); + spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_LISTEN: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { - refcnt = - rem_port_from_listenlist(cm_id->local_addr.sin_port); - - if (refcnt == 0) - ret = iw_destroy_listen(cm_id); - - cm_id->device->iwcm->destroy_listen_ep(cm_id); - } else { - ret = iw_destroy_listen(cm_id); - cm_id->device->iwcm->destroy_listen_ep(cm_id); - } + /* destroy the listening endpoint */ + cm_id->device->iwcm->destroy_listen(cm_id); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* Abrupt close of the connection */ (void)iwcm_modify_qp_err(cm_id_priv->qp); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_IDLE: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_DESTROYING; break; case IW_CM_STATE_CONN_RECV: /* * App called destroy before/without calling accept after * receiving connection request event notification or * returned non zero from the event callback function. * In either case, must tell the provider to reject. */ cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id->device->iwcm->reject(cm_id, NULL, 0); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_DESTROYING: default: BUG(); break; } if (cm_id_priv->qp) { cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); (void)iwcm_deref_id(cm_id_priv); } /* * This function is only called by the application thread and cannot * be called by the event thread. The function will wait for all * references to be released on the cm_id and then kfree the cm_id * object. 
*/ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)); - destroy_cm_id(cm_id); +} +EXPORT_SYMBOL(iw_destroy_cm_id); - wait_for_completion(&cm_id_priv->destroy_comp); +/** + * iw_cm_map - Use portmapper to map the ports + * @cm_id: connection manager pointer + * @active: Indicates the active side when true + * returns nonzero for error only if iwpm_create_mapinfo() fails + * + * Tries to add a mapping for a port using the Portmapper. If + * successful in mapping the IP/Port it will check the remote + * mapped IP address for a wildcard IP address and replace the + * zero IP address with the remote_addr. + */ +static int iw_cm_map(struct iw_cm_id *cm_id, bool active) +{ + cm_id->m_local_addr = cm_id->local_addr; + cm_id->m_remote_addr = cm_id->remote_addr; - if (cm_id->so) - sock_release(cm_id->so); - - free_cm_id(cm_id_priv); + return 0; } -EXPORT_SYMBOL(iw_destroy_cm_id); /* * CM_ID <-- LISTEN * * Start listening for connect requests. Generates one CONNECT_REQUEST * event for each inbound connect request. 
*/ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret, refcnt; + int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + if (!backlog) + backlog = default_backlog; + ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - - if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { - refcnt = - add_port_to_listenlist(cm_id->local_addr.sin_port); - - if (refcnt == 1) { - ret = iw_create_listen(cm_id, backlog); - } else if (refcnt <= 0) { - ret = -EINVAL; - } else { - /* if refcnt > 1, a socket listener created - * already. And we need not create socket - * listener on other rdma devices/listen cm_id's - * due to TOE. That is when a socket listener is - * created with INADDR_ANY all registered TOE - * devices will get a call to start - * hardware listeners. - */ - } - } else { - ret = iw_create_listen(cm_id, backlog); - } + ret = iw_cm_map(cm_id, false); if (!ret) - cm_id->device->iwcm->create_listen_ep(cm_id, backlog); - else + ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + if (ret) cm_id_priv->state = IW_CM_STATE_IDLE; - spin_lock_irqsave(&cm_id_priv->lock, flags); break; default: ret = -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(iw_cm_listen); /* * CM_ID <-- IDLE * * Rejects an inbound connection request. No events are generated. 
*/ int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->reject(cm_id, private_data, private_data_len); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_reject); /* * CM_ID <-- ESTABLISHED * * Accepts an inbound connection request and generates an ESTABLISHED * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block * until the ESTABLISHED event is received from the provider. 
*/ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->accept(cm_id, iw_param); if (ret) { /* An error on accept precludes provider events */ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id->device->iwcm->rem_ref(qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); } return ret; } EXPORT_SYMBOL(iw_cm_accept); /* * Active Side: CM_ID <-- CONN_SENT * * If successful, results in the generation of a CONNECT_REPLY * event. iw_cm_disconnect and iw_cm_destroy will block until the * CONNECT_REPLY event is received from the provider. 
*/ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; int ret; unsigned long flags; struct ib_qp *qp; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); ret = alloc_work_entries(cm_id_priv, 4); if (ret) return ret; set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_IDLE) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - return -EINVAL; + ret = -EINVAL; + goto err; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - return -EINVAL; + ret = -EINVAL; + goto err; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->connect(cm_id, iw_param); - if (ret) { - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id_priv->qp) { - cm_id->device->iwcm->rem_ref(qp); - cm_id_priv->qp = NULL; - } - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); - cm_id_priv->state = IW_CM_STATE_IDLE; - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - } + ret = iw_cm_map(cm_id, true); + if (!ret) + ret = cm_id->device->iwcm->connect(cm_id, iw_param); + if (!ret) + return 0; /* success */ + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + cm_id_priv->state = IW_CM_STATE_IDLE; +err: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); 
return ret; } EXPORT_SYMBOL(iw_cm_connect); /* * Passive Side: new CM_ID <-- CONN_RECV * * Handles an inbound connect request. The function creates a new * iw_cm_id to represent the new connection and inherits the client * callback function and other attributes from the listening parent. * * The work item contains a pointer to the listen_cm_id and the event. The * listen_cm_id contains the client cm_handler, context and * device. These are copied when the device is cloned. The event * contains the new four tuple. * * An error on the child should not affect the parent, so this * function does not return a value. */ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; struct iw_cm_id *cm_id; struct iwcm_id_private *cm_id_priv; int ret; /* * The provider should never generate a connection request * event with a bad status. */ BUG_ON(iw_event->status); cm_id = iw_create_cm_id(listen_id_priv->id.device, - iw_event->so, listen_id_priv->id.cm_handler, listen_id_priv->id.context); /* If the cm_id could not be created, ignore the request */ if (IS_ERR(cm_id)) goto out; cm_id->provider_data = iw_event->provider_data; - cm_id->local_addr = iw_event->local_addr; + cm_id->m_local_addr = iw_event->local_addr; + cm_id->m_remote_addr = iw_event->remote_addr; + cm_id->local_addr = listen_id_priv->id.local_addr; cm_id->remote_addr = iw_event->remote_addr; - cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; /* * We could be destroying the listening id. If so, ignore this * upcall. 
*/ spin_lock_irqsave(&listen_id_priv->lock, flags); if (listen_id_priv->state != IW_CM_STATE_LISTEN) { spin_unlock_irqrestore(&listen_id_priv->lock, flags); iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } spin_unlock_irqrestore(&listen_id_priv->lock, flags); ret = alloc_work_entries(cm_id_priv, 3); if (ret) { iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } /* Call the client CM handler */ ret = cm_id->cm_handler(cm_id, iw_event); if (ret) { iw_cm_reject(cm_id, NULL, 0); - set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - destroy_cm_id(cm_id); - if (atomic_read(&cm_id_priv->refcount)==0) - free_cm_id(cm_id_priv); + iw_destroy_cm_id(cm_id); } out: if (iw_event->private_data_len) kfree(iw_event->private_data); } /* * Passive Side: CM_ID <-- ESTABLISHED * * The provider generated an ESTABLISHED event which means that * the MPA negotion has completed successfully and we are now in MPA * FPDU mode. * * This event can only be received in the CONN_RECV state. If the * remote peer closed, the ESTABLISHED event would be received followed * by the CLOSE event. If the app closes, it will block until we wake * it up after processing this event. */ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * We clear the CONNECT_WAIT bit here to allow the callback * function to call iw_cm_disconnect. Calling iw_destroy_cm_id * from a callback handler is not allowed. 
*/ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_ESTABLISHED; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * Active Side: CM_ID <-- ESTABLISHED * * The app has called connect and is waiting for the established event to * post it's requests to the server. This event will wake up anyone * blocked in iw_cm_disconnect or iw_destroy_id. */ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * Clear the connect wait bit so a callback function calling * iw_cm_disconnect will not wait and deadlock this thread */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); if (iw_event->status == 0) { - cm_id_priv->id.local_addr = iw_event->local_addr; - cm_id_priv->id.remote_addr = iw_event->remote_addr; + cm_id_priv->id.m_local_addr = iw_event->local_addr; + cm_id_priv->id.m_remote_addr = iw_event->remote_addr; + iw_event->local_addr = cm_id_priv->id.local_addr; + iw_event->remote_addr = cm_id_priv->id.remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); if (iw_event->private_data_len) kfree(iw_event->private_data); /* Wake up waiters on connect complete */ wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * CM_ID <-- CLOSING * * If in the ESTABLISHED state, move to CLOSING. 
*/ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) cm_id_priv->state = IW_CM_STATE_CLOSING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * CM_ID <-- IDLE * * If in the ESTBLISHED or CLOSING states, the QP will have have been * moved by the provider to the ERR state. Disassociate the CM_ID from * the QP, move to IDLE, and remove the 'connected' reference. * * If in some other state, the cm_id was destroyed asynchronously. * This is the last reference that will result in waking up * the app thread blocked in iw_destroy_cm_id. */ static int cm_close_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_DESTROYING: break; default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int process_event(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { int ret = 0; switch (iw_event->event) { case IW_CM_EVENT_CONNECT_REQUEST: cm_conn_req_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CONNECT_REPLY: ret = cm_conn_rep_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_ESTABLISHED: ret = cm_conn_est_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_DISCONNECT: cm_disconnect_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CLOSE: ret = cm_close_handler(cm_id_priv, iw_event); break; default: BUG(); } return ret; } /* * Process events on the 
work_list for the cm_id. If the callback * function requests that the cm_id be deleted, a flag is set in the * cm_id flags to indicate that when the last reference is * removed, the cm_id is to be destroyed. This is necessary to * distinguish between an object that will be destroyed by the app * thread asleep on the destroy_comp list vs. an object destroyed * here synchronously when the last reference is removed. */ static void cm_work_handler(struct work_struct *_work) { struct iwcm_work *work = container_of(_work, struct iwcm_work, work); struct iw_cm_event levent; struct iwcm_id_private *cm_id_priv = work->cm_id; unsigned long flags; int empty; int ret = 0; - int destroy_id; spin_lock_irqsave(&cm_id_priv->lock, flags); empty = list_empty(&cm_id_priv->work_list); while (!empty) { work = list_entry(cm_id_priv->work_list.next, struct iwcm_work, list); list_del_init(&work->list); empty = list_empty(&cm_id_priv->work_list); levent = work->event; put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = process_event(cm_id_priv, &levent); - if (ret) { - set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - destroy_cm_id(&cm_id_priv->id); - } - BUG_ON(atomic_read(&cm_id_priv->refcount)==0); - destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - if (iwcm_deref_id(cm_id_priv)) { - if (destroy_id) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); - free_cm_id(cm_id_priv); - } + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) + destroy_cm_id(&cm_id_priv->id); + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) return; - } if (empty) return; spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * This function is called on interrupt context. Schedule events on * the iwcm_wq thread to allow callback functions to downcall into * the CM and/or block. Events are queued to a per-CM_ID * work_list. 
If this is the first event on the work_list, the work * element is also queued on the iwcm_wq thread. * * Each event holds a reference on the cm_id. Until the last posted * event has been delivered and processed, the cm_id cannot be * deleted. * * Returns: * 0 - the event was handled. * -ENOMEM - the event was not handled due to lack of resources. */ static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct iwcm_work *work; struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); work = get_work(cm_id_priv); if (!work) { ret = -ENOMEM; goto out; } INIT_WORK(&work->work, cm_work_handler); work->cm_id = cm_id_priv; work->event = *iw_event; if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || work->event.event == IW_CM_EVENT_CONNECT_REPLY) && work->event.private_data_len) { ret = copy_private_data(&work->event); if (ret) { put_work(work); goto out; } } atomic_inc(&cm_id_priv->refcount); if (list_empty(&cm_id_priv->work_list)) { list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); } else list_add_tail(&work->list, &cm_id_priv->work_list); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_READ; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { 
unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = 0; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct iwcm_id_private *cm_id_priv; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: case IB_QPS_RTR: ret = iwcm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = iwcm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(iw_cm_init_qp_attr); static int __init iw_cm_init(void) { - iwcm_wq = create_singlethread_workqueue("iw_cm_wq"); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) return -ENOMEM; return 0; } static void __exit iw_cm_cleanup(void) { destroy_workqueue(iwcm_wq); } module_init(iw_cm_init); module_exit(iw_cm_cleanup); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwcm.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwcm.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwcm.h (revision 319974) @@ -1,62 +1,62 @@ /* * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #ifndef IWCM_H #define IWCM_H enum iw_cm_state { IW_CM_STATE_IDLE, /* unbound, inactive */ IW_CM_STATE_LISTEN, /* listen waiting for connect */ IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */ IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */ IW_CM_STATE_ESTABLISHED, /* established */ IW_CM_STATE_CLOSING, /* disconnect */ IW_CM_STATE_DESTROYING /* object being deleted */ }; struct iwcm_id_private { struct iw_cm_id id; enum iw_cm_state state; unsigned long flags; struct ib_qp *qp; struct completion destroy_comp; wait_queue_head_t connect_wait; struct list_head work_list; spinlock_t lock; atomic_t refcount; struct list_head work_free_list; }; -#define IWCM_F_CALLBACK_DESTROY 1 +#define IWCM_F_DROP_EVENTS 1 #define IWCM_F_CONNECT_WAIT 2 #endif /* IWCM_H */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwpm_util.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwpm_util.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwpm_util.h (revision 319974) @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +#define IWPM_REG_UNDEF 0x01 +#define IWPM_REG_VALID 0x02 +#define IWPM_REG_INCOMPL 0x04 + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * + * Returns 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); +#endif Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/iwpm_util.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: 
projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad.c (revision 319974) @@ -1,3694 +1,3339 @@ /* * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ #define LINUXKPI_PARAM_PREFIX ibcore_ +#define KBUILD_MODNAME "ibcore" +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -#include #include #include "mad_priv.h" #include "mad_rmpp.h" #include "smi.h" +#include "opa_smi.h" #include "agent.h" +#include "core_priv.h" -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("kernel IB MAD API"); -MODULE_AUTHOR("Hal Rosenstock"); -MODULE_AUTHOR("Sean Hefty"); - static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; module_param_named(send_queue_size, mad_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -static struct kmem_cache *ib_mad_cache; - static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; - -/* - * Timeout FIFO (tf) param - */ -enum { - /* min time between 2 consecutive activations of tf workqueue */ - MIN_BETWEEN_ACTIVATIONS_MS = 5 -}; - -/* - * SA congestion control params - */ -enum { - MAX_OUTSTANDING_SA_MADS = 10, - MIN_TIME_FOR_SA_MAD_SEND_MS = 20, - MAX_SA_MADS = 10000 -}; - /* Port list lock */ static DEFINE_SPINLOCK(ib_mad_port_list_lock); /* Forward declarations */ static int method_in_use(struct ib_mad_mgmt_method_table **method, struct ib_mad_reg_req *mad_reg_req); static void remove_mad_reg_req(struct ib_mad_agent_private *priv); static struct ib_mad_agent_private *find_mad_agent( struct ib_mad_port_private *port_priv, - struct ib_mad *mad); + const struct ib_mad_hdr *mad); static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad); static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv); static void timeout_sends(struct work_struct *work); static void local_completions(struct work_struct *work); static int add_nonoui_reg_req(struct ib_mad_reg_req 
*mad_reg_req, struct ib_mad_agent_private *agent_priv, u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); -static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, - u32 timeout_ms, u32 retries_left); +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); - /* - * Timeout FIFO functions - implements FIFO with timeout mechanism - */ - -static void activate_timeout_handler_task(unsigned long data) -{ - struct to_fifo *tf; - - tf = (struct to_fifo *)data; - del_timer(&tf->timer); - queue_work(tf->workq, &tf->work); -} - -static unsigned long adjusted_time(unsigned long last, unsigned long next) -{ - unsigned long min_next; - - min_next = last + msecs_to_jiffies(MIN_BETWEEN_ACTIVATIONS_MS); - if (time_after(min_next, next)) - return min_next; - - return next; -} - -static void notify_failure(struct ib_mad_send_wr_private *mad_send_wr, - enum ib_wc_status status) -{ - struct ib_mad_send_wc mad_send_wc; - struct ib_mad_agent_private *mad_agent_priv; - - mad_send_wc.status = status; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - mad_agent_priv = mad_send_wr->mad_agent_priv; - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); -} - -static inline struct sa_cc_data * -get_cc_obj(struct ib_mad_send_wr_private *mad_send_wr) -{ - return &mad_send_wr->mad_agent_priv->qp_info->port_priv->sa_cc; -} - -static inline struct ib_mad_send_wr_private *tfe_to_mad(struct tf_entry *tfe) -{ - return container_of(tfe, struct ib_mad_send_wr_private, tf_list); -} - -static void timeout_handler_task(struct work_struct *work) -{ - struct tf_entry *tmp1, *tmp2; - struct list_head *list_item, exp_lst; - unsigned long flags, curr_time; - int lst_empty; - struct to_fifo *tf; - - tf = container_of(work, struct to_fifo, work); - do { - INIT_LIST_HEAD(&exp_lst); 
- - spin_lock_irqsave(&tf->lists_lock, flags); - curr_time = jiffies; - list_for_each(list_item, &tf->to_head) { - tmp1 = list_entry(list_item, struct tf_entry, to_list); - if (time_before(curr_time, tmp1->exp_time)) - break; - list_del(&tmp1->fifo_list); - tf->num_items--; - } - - /* cut list up to and including list_item->prev */ - list_cut_position(&exp_lst, &tf->to_head, list_item->prev); - spin_unlock_irqrestore(&tf->lists_lock, flags); - - lst_empty = list_empty(&exp_lst); - list_for_each_entry_safe(tmp1, tmp2, &exp_lst, to_list) { - list_del(&tmp1->to_list); - if (tmp1->canceled) { - tmp1->canceled = 0; - notify_failure(tfe_to_mad(tmp1), IB_WC_WR_FLUSH_ERR); - } else { - notify_failure(tfe_to_mad(tmp1), IB_WC_RESP_TIMEOUT_ERR); - } - } - } while (!lst_empty); - - spin_lock_irqsave(&tf->lists_lock, flags); - if (!list_empty(&tf->to_head)) { - tmp1 = list_entry(tf->to_head.next, struct tf_entry, to_list); - mod_timer(&tf->timer, adjusted_time(curr_time, tmp1->exp_time)); - } - spin_unlock_irqrestore(&tf->lists_lock, flags); -} - -/** - * tf_create - creates new timeout-fifo object - * @fifo_size: Maximum fifo size - * - * Allocate and initialize new timeout-fifo object - */ -static struct to_fifo *tf_create(u32 fifo_size) -{ - struct to_fifo *tf; - - tf = kzalloc(sizeof(*tf), GFP_KERNEL); - if (tf) { - tf->workq = create_singlethread_workqueue("to_fifo"); - if (!tf->workq) { - kfree(tf); - return NULL; - } - spin_lock_init(&tf->lists_lock); - INIT_LIST_HEAD(&tf->to_head); - INIT_LIST_HEAD(&tf->fifo_head); - init_timer(&tf->timer); - INIT_WORK(&tf->work, timeout_handler_task); - tf->timer.data = (unsigned long) tf; - tf->timer.function = activate_timeout_handler_task; - tf->timer.expires = jiffies; - tf->fifo_size = fifo_size; - tf->stop_enqueue = 0; - tf->num_items = 0; - } - - return tf; -} - -/** - * tf_enqueue - enqueue item to timeout-fifo object - * @tf:timeout-fifo object - * @item: item to enqueue. - * @timeout_ms: item expiration time in ms. 
- * - * Enqueue item to fifo and modify expiration timer when required. - * - * Returns 0 on success and negative on failure. - */ -static int tf_enqueue(struct to_fifo *tf, struct tf_entry *item, u32 timeout_ms) -{ - struct tf_entry *tmp; - struct list_head *list_item; - unsigned long flags; - - item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); - - spin_lock_irqsave(&tf->lists_lock, flags); - if (tf->num_items >= tf->fifo_size || tf->stop_enqueue) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return -EBUSY; - } - - /* Insert item to timeout list */ - list_for_each_prev(list_item, &tf->to_head) { - tmp = list_entry(list_item, struct tf_entry, to_list); - if (time_after(item->exp_time, tmp->exp_time)) - break; - } - - list_add(&item->to_list, list_item); - - /* Insert item to fifo list */ - list_add_tail(&item->fifo_list, &tf->fifo_head); - - tf->num_items++; - - /* modify expiration timer if required */ - if (list_item == &tf->to_head) - mod_timer(&tf->timer, item->exp_time); - - spin_unlock_irqrestore(&tf->lists_lock, flags); - - return 0; -} - -/** - * tf_dequeue - dequeue item from timeout-fifo object - * @tf:timeout-fifo object - * @time_left_ms: returns the time left for expiration in ms. - * - * Dequeue item from fifo and modify expiration timer when required. - * - * Returns pointer to tf_entry on success and NULL on failure. 
- */ -static struct tf_entry *tf_dequeue(struct to_fifo *tf, u32 *time_left_ms) -{ - unsigned long flags; - unsigned long time_left; - struct tf_entry *tmp, *tmp1; - bool found = false; - - spin_lock_irqsave(&tf->lists_lock, flags); - if (list_empty(&tf->fifo_head)) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return NULL; - } - - list_for_each_entry(tmp, &tf->fifo_head, fifo_list) { - if (!tmp->canceled) { - found = true; - break; - } - } - - if (!found) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return NULL; - } - - /* modify timer in case enqueued item is the next to expire */ - if (tf->to_head.next == &tmp->to_list) { - if (list_is_last(&tmp->to_list, &tf->to_head)) { - del_timer(&tf->timer); - } else { - tmp1 = list_entry(tmp->to_list.next, struct tf_entry, to_list); - mod_timer(&tf->timer, tmp1->exp_time); - } - } - list_del(&tmp->fifo_list); - list_del(&tmp->to_list); - tf->num_items--; - spin_unlock_irqrestore(&tf->lists_lock, flags); - - time_left = tmp->exp_time - jiffies; - if ((long) time_left <= 0) - time_left = 0; - *time_left_ms = jiffies_to_msecs(time_left); - - return tmp; -} - -static void tf_stop_enqueue(struct to_fifo *tf) -{ - unsigned long flags; - - spin_lock_irqsave(&tf->lists_lock, flags); - tf->stop_enqueue = 1; - spin_unlock_irqrestore(&tf->lists_lock, flags); -} - -/** - * tf_free - free empty timeout-fifo object - * @tf:timeout-fifo object - * - */ -static void tf_free(struct to_fifo *tf) -{ - del_timer_sync(&tf->timer); - flush_workqueue(tf->workq); - destroy_workqueue(tf->workq); - - kfree(tf); -} - -/** - * tf_free_agent - free MADs related to specific MAD agent from timeout-fifo - * @tf:timeout-fifo object - * @mad_agent_priv: MAD agent. 
- * - */ -static void tf_free_agent(struct to_fifo *tf, struct ib_mad_agent_private *mad_agent_priv) -{ - unsigned long flags; - struct tf_entry *tmp, *tmp1; - struct list_head tmp_head; - - INIT_LIST_HEAD(&tmp_head); - spin_lock_irqsave(&tf->lists_lock, flags); - list_for_each_entry_safe(tmp, tmp1, &tf->fifo_head, fifo_list) { - if (tfe_to_mad(tmp)->mad_agent_priv == mad_agent_priv) { - list_del(&tmp->to_list); - list_move(&tmp->fifo_list, &tmp_head); - tf->num_items--; - } - } - spin_unlock_irqrestore(&tf->lists_lock, flags); - - list_for_each_entry_safe(tmp, tmp1, &tmp_head, fifo_list) { - list_del(&tmp->fifo_list); - notify_failure(tfe_to_mad(tmp), IB_WC_WR_FLUSH_ERR); - } -} - -/** - * tf_modify_item - to modify expiration time for specific item - * @tf:timeout-fifo object - * @mad_agent_priv: MAD agent. - * @send_buf: the MAD to modify in queue - * @timeout_ms: new timeout to set. - * - * Returns 0 if item found on list and -ENXIO if not. - * - * Note: The send_buf may point on MAD that is already released. 
- * Therefore we can't use this struct before finding it in the list - */ -static int tf_modify_item(struct to_fifo *tf, - struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_send_buf *send_buf, u32 timeout_ms) -{ - struct tf_entry *tmp, *item; - struct list_head *list_item; - unsigned long flags; - int found = 0; - - spin_lock_irqsave(&tf->lists_lock, flags); - list_for_each_entry(item, &tf->fifo_head, fifo_list) { - if (tfe_to_mad(item)->mad_agent_priv == mad_agent_priv && - &tfe_to_mad(item)->send_buf == send_buf) { - found = 1; - break; - } - } - - if (!found) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return -ENXIO; - } - - item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); - - if (timeout_ms) { - list_del(&item->to_list); - list_for_each_prev(list_item, &tf->to_head) { - tmp = list_entry(list_item, struct tf_entry, to_list); - if (time_after(item->exp_time, tmp->exp_time)) - break; - } - list_add(&item->to_list, list_item); - - /* modify expiration timer if required */ - if (list_item == &tf->to_head) - mod_timer(&tf->timer, item->exp_time); - } else { - /* - * when item canceled (timeout_ms == 0) move item to - * head of timeout list and to the tail of fifo list - */ - item->canceled = 1; - list_move(&item->to_list, &tf->to_head); - list_move_tail(&item->fifo_list, &tf->fifo_head); - mod_timer(&tf->timer, item->exp_time); - } - spin_unlock_irqrestore(&tf->lists_lock, flags); - - return 0; -} - -/* - * SA congestion control functions - */ - -/* - * Defines which MAD is under congestion control. - */ -static int is_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr) -{ - struct ib_mad_hdr *mad; - - mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad; - - return ((mad_send_wr->send_buf.timeout_ms) && - (mad->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) && - ((mad->method == IB_MGMT_METHOD_GET) || - (mad->method == IB_MGMT_METHOD_SET))); -} - -/* - * Notify that SA congestion controlled MAD is done. 
- * to allow dequeuing SA MAD from congestion control queue. - */ -static void sa_cc_mad_done(struct sa_cc_data *cc_obj) -{ - unsigned long flags; - struct tf_entry *tfe; - struct ib_mad_send_wr_private *mad_send_wr; - u32 time_left_ms, timeout_ms, retries; - int ret; - - do { - spin_lock_irqsave(&cc_obj->lock, flags); - tfe = tf_dequeue(cc_obj->tf, &time_left_ms); - if (!tfe) { - if (cc_obj->outstanding > 0) - cc_obj->outstanding--; - spin_unlock_irqrestore(&cc_obj->lock, flags); - break; - } - spin_unlock_irqrestore(&cc_obj->lock, flags); - mad_send_wr = tfe_to_mad(tfe); - time_left_ms += MIN_TIME_FOR_SA_MAD_SEND_MS; - if (time_left_ms > mad_send_wr->send_buf.timeout_ms) { - retries = time_left_ms / mad_send_wr->send_buf.timeout_ms - 1; - timeout_ms = mad_send_wr->send_buf.timeout_ms; - } else { - retries = 0; - timeout_ms = time_left_ms; - } - ret = send_sa_cc_mad(mad_send_wr, timeout_ms, retries); - if (ret) { - if (ret == -ENOMEM) - notify_failure(mad_send_wr, IB_WC_GENERAL_ERR); - else - notify_failure(mad_send_wr, IB_WC_LOC_QP_OP_ERR); - } - } while (ret); -} - -/* - * Send SA MAD under congestion control. - */ -static int sa_cc_mad_send(struct ib_mad_send_wr_private *mad_send_wr) -{ - unsigned long flags; - int ret; - struct sa_cc_data *cc_obj; - - cc_obj = get_cc_obj(mad_send_wr); - spin_lock_irqsave(&cc_obj->lock, flags); - if (cc_obj->outstanding < MAX_OUTSTANDING_SA_MADS) { - cc_obj->outstanding++; - spin_unlock_irqrestore(&cc_obj->lock, flags); - ret = send_sa_cc_mad(mad_send_wr, mad_send_wr->send_buf.timeout_ms, - mad_send_wr->retries_left); - if (ret) - sa_cc_mad_done(cc_obj); - - } else { - int qtime = (mad_send_wr->send_buf.timeout_ms * - (mad_send_wr->retries_left + 1)) - - MIN_TIME_FOR_SA_MAD_SEND_MS; - - if (qtime < 0) - qtime = 0; - ret = tf_enqueue(cc_obj->tf, &mad_send_wr->tf_list, (u32)qtime); - - spin_unlock_irqrestore(&cc_obj->lock, flags); - } - - return ret; -} - -/* - * Initialize SA congestion control. 
- */ -static int sa_cc_init(struct sa_cc_data *cc_obj) -{ - spin_lock_init(&cc_obj->lock); - cc_obj->outstanding = 0; - cc_obj->tf = tf_create(MAX_SA_MADS); - if (!cc_obj->tf) - return -ENOMEM; - return 0; -} - -/* - * Cancel SA MADs from congestion control queue. - */ -static void cancel_sa_cc_mads(struct ib_mad_agent_private *mad_agent_priv) -{ - tf_free_agent(mad_agent_priv->qp_info->port_priv->sa_cc.tf, - mad_agent_priv); -} - -/* - * Modify timeout of SA MAD on congestion control queue. - */ -static int modify_sa_cc_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_send_buf *send_buf, u32 timeout_ms) -{ - int ret; - int qtime = 0; - - if (timeout_ms > MIN_TIME_FOR_SA_MAD_SEND_MS) - qtime = timeout_ms - MIN_TIME_FOR_SA_MAD_SEND_MS; - - ret = tf_modify_item(mad_agent_priv->qp_info->port_priv->sa_cc.tf, - mad_agent_priv, send_buf, (u32)qtime); - return ret; -} - -static void sa_cc_destroy(struct sa_cc_data *cc_obj) -{ - struct ib_mad_send_wr_private *mad_send_wr; - struct tf_entry *tfe; - struct ib_mad_send_wc mad_send_wc; - struct ib_mad_agent_private *mad_agent_priv; - u32 time_left_ms; - - mad_send_wc.status = IB_WC_WR_FLUSH_ERR; - mad_send_wc.vendor_err = 0; - - tf_stop_enqueue(cc_obj->tf); - tfe = tf_dequeue(cc_obj->tf, &time_left_ms); - while (tfe) { - mad_send_wr = tfe_to_mad(tfe); - mad_send_wc.send_buf = &mad_send_wr->send_buf; - mad_agent_priv = mad_send_wr->mad_agent_priv; - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); - tfe = tf_dequeue(cc_obj->tf, &time_left_ms); - } - tf_free(cc_obj->tf); -} - -/* * Returns a ib_mad_port_private structure or NULL for a device/port * Assumes ib_mad_port_list_lock is being held */ static inline struct ib_mad_port_private * __ib_get_mad_port(struct ib_device *device, int port_num) { struct ib_mad_port_private *entry; list_for_each_entry(entry, &ib_mad_port_list, port_list) { if (entry->device == device && entry->port_num == port_num) return entry; } return NULL; } /* * 
Wrapper function to return a ib_mad_port_private structure or NULL * for a device/port */ static inline struct ib_mad_port_private * ib_get_mad_port(struct ib_device *device, int port_num) { struct ib_mad_port_private *entry; unsigned long flags; spin_lock_irqsave(&ib_mad_port_list_lock, flags); entry = __ib_get_mad_port(device, port_num); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); return entry; } static inline u8 convert_mgmt_class(u8 mgmt_class) { /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ? 0 : mgmt_class; } static int get_spl_qp_index(enum ib_qp_type qp_type) { switch (qp_type) { case IB_QPT_SMI: return 0; case IB_QPT_GSI: return 1; default: return -1; } } static int vendor_class_index(u8 mgmt_class) { return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START; } static int is_vendor_class(u8 mgmt_class) { if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) || (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END)) return 0; return 1; } static int is_vendor_oui(char *oui) { if (oui[0] || oui[1] || oui[2]) return 1; return 0; } static int is_vendor_method_in_use( struct ib_mad_mgmt_vendor_class *vendor_class, struct ib_mad_reg_req *mad_reg_req) { struct ib_mad_mgmt_method_table *method; int i; for (i = 0; i < MAX_MGMT_OUI; i++) { if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) { method = vendor_class->method_table[i]; if (method) { if (method_in_use(&method, mad_reg_req)) return 1; else break; } } } return 0; } -int ib_response_mad(struct ib_mad *mad) +int ib_response_mad(const struct ib_mad_hdr *hdr) { - return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) || - (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || - ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) && - (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP))); + return ((hdr->method & IB_MGMT_METHOD_RESP) || + (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((hdr->mgmt_class == IB_MGMT_CLASS_BM) && + (hdr->attr_mod & 
IB_BM_ATTR_MOD_RESP))); } EXPORT_SYMBOL(ib_response_mad); /* * ib_register_mad_agent - Register to send/receive MADs */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, struct ib_mad_reg_req *mad_reg_req, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, - void *context) + void *context, + u32 registration_flags) { struct ib_mad_port_private *port_priv; struct ib_mad_agent *ret = ERR_PTR(-EINVAL); struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_reg_req *reg_req = NULL; struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; struct ib_mad_mgmt_method_table *method; int ret2, qpn; unsigned long flags; u8 mgmt_class, vclass; /* Validate parameters */ qpn = get_spl_qp_index(qp_type); - if (qpn == -1) + if (qpn == -1) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid QP Type %d\n", + qp_type); goto error1; + } - if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid RMPP Version %u\n", + rmpp_version); goto error1; + } /* Validate MAD registration request if supplied */ if (mad_reg_req) { - if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid Class Version %u\n", + mad_reg_req->mgmt_class_version); goto error1; - if (!recv_handler) + } + if (!recv_handler) { + dev_notice(&device->dev, + "ib_register_mad_agent: no recv_handler\n"); goto error1; + } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { /* * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only * one in this range currently allowed */ if (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev_notice(&device->dev, + 
"ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else if (mad_reg_req->mgmt_class == 0) { /* * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0\n"); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* * If class is in "new" vendor range, * ensure supplied OUI is not zero */ - if (!is_vendor_oui(mad_reg_req->oui)) + if (!is_vendor_oui(mad_reg_req->oui)) { + dev_notice(&device->dev, + "ib_register_mad_agent: No OUI specified for class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { - if (rmpp_version) + if (rmpp_version) { + dev_notice(&device->dev, + "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } + /* Make sure class supplied is consistent with QP type */ if (qp_type == IB_QPT_SMI) { if ((mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else { if ((mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } } else { /* No registration request supplied */ if (!send_handler) goto error1; + if (registration_flags & IB_MAD_USER_RMPP) + goto error1; } /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { + dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n"); ret = 
ERR_PTR(-ENODEV); goto error1; } /* Verify the QP requested is supported. For example, Ethernet devices * will not have QP0 */ if (!port_priv->qp_info[qpn].qp) { + dev_notice(&device->dev, + "ib_register_mad_agent: QP %d not supported\n", qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { ret = ERR_PTR(-ENOMEM); goto error1; } - mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(mad_agent_priv->agent.mr)) { - ret = ERR_PTR(-ENOMEM); - goto error2; - } - if (mad_reg_req) { reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); if (!reg_req) { ret = ERR_PTR(-ENOMEM); goto error3; } } /* Now, fill in the various structures */ mad_agent_priv->qp_info = &port_priv->qp_info[qpn]; mad_agent_priv->reg_req = reg_req; mad_agent_priv->agent.rmpp_version = rmpp_version; mad_agent_priv->agent.device = device; mad_agent_priv->agent.recv_handler = recv_handler; mad_agent_priv->agent.send_handler = send_handler; mad_agent_priv->agent.context = context; mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_agent_priv->agent.port_num = port_num; + mad_agent_priv->agent.flags = registration_flags; spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); INIT_LIST_HEAD(&mad_agent_priv->done_list); INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); atomic_set(&mad_agent_priv->refcount, 1); init_completion(&mad_agent_priv->comp); spin_lock_irqsave(&port_priv->reg_lock, flags); mad_agent_priv->agent.hi_tid = ++ib_mad_client_id; /* * Make sure MAD registration (if supplied) * is non overlapping with any existing ones */ if (mad_reg_req) { mgmt_class = 
convert_mgmt_class(mad_reg_req->mgmt_class); if (!is_vendor_class(mgmt_class)) { class = port_priv->version[mad_reg_req-> mgmt_class_version].class; if (class) { method = class->method_table[mgmt_class]; if (method) { if (method_in_use(&method, mad_reg_req)) goto error4; } } ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv, mgmt_class); } else { /* "New" vendor class range */ vendor = port_priv->version[mad_reg_req-> mgmt_class_version].vendor; if (vendor) { vclass = vendor_class_index(mgmt_class); vendor_class = vendor->vendor_class[vclass]; if (vendor_class) { if (is_vendor_method_in_use( vendor_class, mad_reg_req)) goto error4; } } ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); } if (ret2) { ret = ERR_PTR(ret2); goto error4; } } /* Add mad agent into port's agent list */ list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list); spin_unlock_irqrestore(&port_priv->reg_lock, flags); return &mad_agent_priv->agent; error4: spin_unlock_irqrestore(&port_priv->reg_lock, flags); kfree(reg_req); error3: - ib_dereg_mr(mad_agent_priv->agent.mr); -error2: kfree(mad_agent_priv); error1: return ret; } EXPORT_SYMBOL(ib_register_mad_agent); static inline int is_snooping_sends(int mad_snoop_flags) { return (mad_snoop_flags & (/*IB_MAD_SNOOP_POSTED_SENDS | IB_MAD_SNOOP_RMPP_SENDS |*/ IB_MAD_SNOOP_SEND_COMPLETIONS /*| IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/)); } static inline int is_snooping_recvs(int mad_snoop_flags) { return (mad_snoop_flags & (IB_MAD_SNOOP_RECVS /*| IB_MAD_SNOOP_RMPP_RECVS*/)); } static int register_snoop_agent(struct ib_mad_qp_info *qp_info, struct ib_mad_snoop_private *mad_snoop_priv) { struct ib_mad_snoop_private **new_snoop_table; unsigned long flags; int i; spin_lock_irqsave(&qp_info->snoop_lock, flags); /* Check for empty slot in array. */ for (i = 0; i < qp_info->snoop_table_size; i++) if (!qp_info->snoop_table[i]) break; if (i == qp_info->snoop_table_size) { /* Grow table. 
*/ new_snoop_table = krealloc(qp_info->snoop_table, sizeof mad_snoop_priv * (qp_info->snoop_table_size + 1), GFP_ATOMIC); if (!new_snoop_table) { i = -ENOMEM; goto out; } qp_info->snoop_table = new_snoop_table; qp_info->snoop_table_size++; } qp_info->snoop_table[i] = mad_snoop_priv; atomic_inc(&qp_info->snoop_count); out: spin_unlock_irqrestore(&qp_info->snoop_lock, flags); return i; } struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, int mad_snoop_flags, ib_mad_snoop_handler snoop_handler, ib_mad_recv_handler recv_handler, void *context) { struct ib_mad_port_private *port_priv; struct ib_mad_agent *ret; struct ib_mad_snoop_private *mad_snoop_priv; int qpn; /* Validate parameters */ if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) { ret = ERR_PTR(-EINVAL); goto error1; } qpn = get_spl_qp_index(qp_type); if (qpn == -1) { ret = ERR_PTR(-EINVAL); goto error1; } port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { ret = ERR_PTR(-ENODEV); goto error1; } /* Allocate structures */ mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL); if (!mad_snoop_priv) { ret = ERR_PTR(-ENOMEM); goto error1; } /* Now, fill in the various structures */ mad_snoop_priv->qp_info = &port_priv->qp_info[qpn]; mad_snoop_priv->agent.device = device; mad_snoop_priv->agent.recv_handler = recv_handler; mad_snoop_priv->agent.snoop_handler = snoop_handler; mad_snoop_priv->agent.context = context; mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_snoop_priv->agent.port_num = port_num; mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; init_completion(&mad_snoop_priv->comp); mad_snoop_priv->snoop_index = register_snoop_agent( &port_priv->qp_info[qpn], mad_snoop_priv); if (mad_snoop_priv->snoop_index < 0) { ret = ERR_PTR(mad_snoop_priv->snoop_index); goto error2; } atomic_set(&mad_snoop_priv->refcount, 1); return &mad_snoop_priv->agent; error2: 
kfree(mad_snoop_priv); error1: return ret; } EXPORT_SYMBOL(ib_register_mad_snoop); static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { if (atomic_dec_and_test(&mad_agent_priv->refcount)) complete(&mad_agent_priv->comp); } static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv) { if (atomic_dec_and_test(&mad_snoop_priv->refcount)) complete(&mad_snoop_priv->comp); } static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_port_private *port_priv; unsigned long flags; /* Note that we could still be handling received MADs */ /* * Canceling all sends results in dropping received response * MADs, preventing us from queuing additional work */ cancel_mads(mad_agent_priv); port_priv = mad_agent_priv->qp_info->port_priv; - cancel_delayed_work_sync(&mad_agent_priv->timed_work); + cancel_delayed_work(&mad_agent_priv->timed_work); spin_lock_irqsave(&port_priv->reg_lock, flags); remove_mad_reg_req(mad_agent_priv); list_del(&mad_agent_priv->agent_list); spin_unlock_irqrestore(&port_priv->reg_lock, flags); flush_workqueue(port_priv->wq); ib_cancel_rmpp_recvs(mad_agent_priv); deref_mad_agent(mad_agent_priv); wait_for_completion(&mad_agent_priv->comp); kfree(mad_agent_priv->reg_req); - ib_dereg_mr(mad_agent_priv->agent.mr); kfree(mad_agent_priv); } static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) { struct ib_mad_qp_info *qp_info; unsigned long flags; qp_info = mad_snoop_priv->qp_info; spin_lock_irqsave(&qp_info->snoop_lock, flags); qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL; atomic_dec(&qp_info->snoop_count); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); deref_snoop_agent(mad_snoop_priv); wait_for_completion(&mad_snoop_priv->comp); kfree(mad_snoop_priv); } /* * ib_unregister_mad_agent - Unregisters a client from using MAD services */ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) { struct ib_mad_agent_private *mad_agent_priv; 
struct ib_mad_snoop_private *mad_snoop_priv; - if (!IS_ERR(mad_agent)) { /* If the TID is zero, the agent can only snoop. */ if (mad_agent->hi_tid) { mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); unregister_mad_agent(mad_agent_priv); } else { mad_snoop_priv = container_of(mad_agent, struct ib_mad_snoop_private, agent); unregister_mad_snoop(mad_snoop_priv); } - } - return 0; } EXPORT_SYMBOL(ib_unregister_mad_agent); static void dequeue_mad(struct ib_mad_list_head *mad_list) { struct ib_mad_queue *mad_queue; unsigned long flags; BUG_ON(!mad_list->mad_queue); mad_queue = mad_list->mad_queue; spin_lock_irqsave(&mad_queue->lock, flags); list_del(&mad_list->list); mad_queue->count--; spin_unlock_irqrestore(&mad_queue->lock, flags); } static void snoop_send(struct ib_mad_qp_info *qp_info, struct ib_mad_send_buf *send_buf, struct ib_mad_send_wc *mad_send_wc, int mad_snoop_flags) { struct ib_mad_snoop_private *mad_snoop_priv; unsigned long flags; int i; spin_lock_irqsave(&qp_info->snoop_lock, flags); for (i = 0; i < qp_info->snoop_table_size; i++) { mad_snoop_priv = qp_info->snoop_table[i]; if (!mad_snoop_priv || !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) continue; atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent, send_buf, mad_send_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); } spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } static void snoop_recv(struct ib_mad_qp_info *qp_info, struct ib_mad_recv_wc *mad_recv_wc, int mad_snoop_flags) { struct ib_mad_snoop_private *mad_snoop_priv; unsigned long flags; int i; spin_lock_irqsave(&qp_info->snoop_lock, flags); for (i = 0; i < qp_info->snoop_table_size; i++) { mad_snoop_priv = qp_info->snoop_table[i]; if (!mad_snoop_priv || !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) continue; atomic_inc(&mad_snoop_priv->refcount); 
spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, mad_recv_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); } spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } -static void build_smp_wc(struct ib_qp *qp, - u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, - struct ib_wc *wc) +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u8 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); - wc->wr_id = wr_id; + wc->wr_cqe = cqe; wc->status = IB_WC_SUCCESS; wc->opcode = IB_WC_RECV; wc->pkey_index = pkey_index; wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh); wc->src_qp = IB_QP0; wc->qp = qp; wc->slid = slid; wc->sl = 0; wc->dlid_path_bits = 0; wc->port_num = port_num; } +static size_t mad_priv_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_mad_private) + mp->mad_size; +} + +static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags) +{ + size_t size = sizeof(struct ib_mad_private) + mad_size; + struct ib_mad_private *ret = kzalloc(size, flags); + + if (ret) + ret->mad_size = mad_size; + + return ret; +} + +static size_t port_mad_size(const struct ib_mad_port_private *port_priv) +{ + return rdma_max_mad_size(port_priv->device, port_priv->port_num); +} + +static size_t mad_priv_dma_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_grh) + mp->mad_size; +} + /* * Return 0 if SMP is to be sent * Return 1 if SMP was consumed locally (whether or not solicited) * Return < 0 if error */ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_wr_private *mad_send_wr) { int ret = 0; struct ib_smp *smp = mad_send_wr->send_buf.mad; + struct opa_smp *opa_smp = (struct opa_smp *)smp; unsigned long flags; struct ib_mad_local_private *local; struct ib_mad_private 
*mad_priv; struct ib_mad_port_private *port_priv; struct ib_mad_agent_private *recv_mad_agent = NULL; struct ib_device *device = mad_agent_priv->agent.device; u8 port_num; struct ib_wc mad_wc; - struct ib_send_wr *send_wr = &mad_send_wr->send_wr; + struct ib_ud_wr *send_wr = &mad_send_wr->send_wr; + size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv); + u16 out_mad_pkey_index = 0; + u16 drslid; + bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); - if (device->node_type == RDMA_NODE_IB_SWITCH && + if (rdma_cap_ib_switch(device) && smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) - port_num = send_wr->wr.ud.port_num; + port_num = send_wr->port_num; else port_num = mad_agent_priv->agent.port_num; /* * Directed route handling starts if the initial LID routed part of * a request or the ending LID routed part of a response is empty. * If we are at the start of the LID routed part, don't update the * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. */ - if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) != - IB_LID_PERMISSIVE) - goto out; - if (smi_handle_dr_smp_send(smp, device->node_type, port_num) == - IB_SMI_DISCARD) { - ret = -EINVAL; - printk(KERN_ERR PFX "Invalid directed route\n"); - goto out; - } + if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) { + u32 opa_drslid; - /* Check to post send on QP or process locally */ - if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && - smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) - goto out; + if ((opa_get_smp_direction(opa_smp) + ? 
opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == + OPA_LID_PERMISSIVE && + opa_smi_handle_dr_smp_send(opa_smp, + rdma_cap_ib_switch(device), + port_num) == IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid directed route\n"); + goto out; + } + opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid); + if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) && + opa_drslid & 0xffff0000) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n", + opa_drslid); + goto out; + } + drslid = (u16)(opa_drslid & 0x0000ffff); + /* Check to post send on QP or process locally */ + if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD && + opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) + goto out; + } else { + if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == + IB_LID_PERMISSIVE && + smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == + IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "Invalid directed route\n"); + goto out; + } + drslid = be16_to_cpu(smp->dr_slid); + + /* Check to post send on QP or process locally */ + if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && + smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) + goto out; + } + local = kmalloc(sizeof *local, GFP_ATOMIC); if (!local) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for ib_mad_local_private\n"); + dev_err(&device->dev, "No memory for ib_mad_local_private\n"); goto out; } local->mad_priv = NULL; local->recv_mad_agent = NULL; - mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); + mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for local response MAD\n"); + dev_err(&device->dev, "No memory for local response MAD\n"); kfree(local); goto out; } build_smp_wc(mad_agent_priv->agent.qp, - send_wr->wr_id, be16_to_cpu(smp->dr_slid), - send_wr->wr.ud.pkey_index, - send_wr->wr.ud.port_num, &mad_wc); + 
send_wr->wr.wr_cqe, drslid, + send_wr->pkey_index, + send_wr->port_num, &mad_wc); + if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) { + mad_wc.byte_len = mad_send_wr->send_buf.hdr_len + + mad_send_wr->send_buf.data_len + + sizeof(struct ib_grh); + } + /* No GRH for DR SMP */ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL, - (struct ib_mad *)smp, - (struct ib_mad *)&mad_priv->mad); + (const struct ib_mad_hdr *)smp, mad_size, + (struct ib_mad_hdr *)mad_priv->mad, + &mad_size, &out_mad_pkey_index); switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: - if (ib_response_mad(&mad_priv->mad.mad) && + if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) && mad_agent_priv->agent.recv_handler) { local->mad_priv = mad_priv; local->recv_mad_agent = mad_agent_priv; /* * Reference MAD agent until receive * side of local completion handled */ atomic_inc(&mad_agent_priv->refcount); } else - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; case IB_MAD_RESULT_SUCCESS: /* Treat like an incoming receive MAD */ port_priv = ib_get_mad_port(mad_agent_priv->agent.device, mad_agent_priv->agent.port_num); if (port_priv) { - memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad)); + memcpy(mad_priv->mad, smp, mad_priv->mad_size); recv_mad_agent = find_mad_agent(port_priv, - &mad_priv->mad.mad); + (const struct ib_mad_hdr *)mad_priv->mad); } if (!port_priv || !recv_mad_agent) { /* * No receiving agent so drop packet and * generate send completion. 
*/ - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; } local->mad_priv = mad_priv; local->recv_mad_agent = recv_mad_agent; break; default: - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); kfree(local); ret = -EINVAL; goto out; } local->mad_send_wr = mad_send_wr; + if (opa) { + local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index; + local->return_wc_byte_len = mad_size; + } /* Reference MAD agent until send side of local completion handled */ atomic_inc(&mad_agent_priv->refcount); /* Queue local completion to local list */ spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&local->completion_list, &mad_agent_priv->local_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); queue_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->local_work); ret = 1; out: return ret; } -static int get_pad_size(int hdr_len, int data_len) +static int get_pad_size(int hdr_len, int data_len, size_t mad_size) { int seg_size, pad; - seg_size = sizeof(struct ib_mad) - hdr_len; + seg_size = mad_size - hdr_len; if (data_len && seg_size) { pad = seg_size - data_len % seg_size; return pad == seg_size ? 0 : pad; } else return seg_size; } static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_segment *s, *t; list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) { list_del(&s->list); kfree(s); } } static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, - gfp_t gfp_mask) + size_t mad_size, gfp_t gfp_mask) { struct ib_mad_send_buf *send_buf = &send_wr->send_buf; struct ib_rmpp_mad *rmpp_mad = send_buf->mad; struct ib_rmpp_segment *seg = NULL; int left, seg_size, pad; - send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len; + send_buf->seg_size = mad_size - send_buf->hdr_len; + send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR; seg_size = send_buf->seg_size; pad = send_wr->pad; /* Allocate data segments. 
*/ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); if (!seg) { - printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem " - "alloc failed for len %zd, gfp %#x\n", - sizeof (*seg) + seg_size, gfp_mask); + dev_err(&send_buf->mad_agent->device->dev, + "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n", + sizeof (*seg) + seg_size, gfp_mask); free_send_rmpp_list(send_wr); return -ENOMEM; } seg->num = ++send_buf->seg_count; list_add_tail(&seg->list, &send_wr->rmpp_list); } /* Zero any padding */ if (pad) memset(seg->data + seg_size - pad, 0, pad); rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv-> agent.rmpp_version; rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); send_wr->cur_seg = container_of(send_wr->rmpp_list.next, struct ib_rmpp_segment, list); send_wr->last_ack_seg = send_wr->cur_seg; return 0; } +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent) +{ + return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); +} +EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); + struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, int hdr_len, int data_len, - gfp_t gfp_mask) + gfp_t gfp_mask, + u8 base_version) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; int pad, message_size, ret, size; void *buf; + size_t mad_size; + bool opa; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); - pad = get_pad_size(hdr_len, data_len); + + opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num); + + if (opa && base_version == OPA_MGMT_BASE_VERSION) + mad_size = sizeof(struct opa_mad); + else + mad_size = sizeof(struct ib_mad); + + pad = get_pad_size(hdr_len, data_len, mad_size); message_size = hdr_len + data_len + pad; - if ((!mad_agent->rmpp_version && - (rmpp_active || 
message_size > sizeof(struct ib_mad))) || - (!rmpp_active && message_size > sizeof(struct ib_mad))) - return ERR_PTR(-EINVAL); + if (ib_mad_kernel_rmpp_agent(mad_agent)) { + if (!rmpp_active && message_size > mad_size) + return ERR_PTR(-EINVAL); + } else + if (rmpp_active || message_size > mad_size) + return ERR_PTR(-EINVAL); - size = rmpp_active ? hdr_len : sizeof(struct ib_mad); + size = rmpp_active ? hdr_len : mad_size; buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); if (!buf) return ERR_PTR(-ENOMEM); - mad_send_wr = buf + size; + mad_send_wr = (struct ib_mad_send_wr_private *)((char *)buf + size); INIT_LIST_HEAD(&mad_send_wr->rmpp_list); mad_send_wr->send_buf.mad = buf; mad_send_wr->send_buf.hdr_len = hdr_len; mad_send_wr->send_buf.data_len = data_len; mad_send_wr->pad = pad; mad_send_wr->mad_agent_priv = mad_agent_priv; mad_send_wr->sg_list[0].length = hdr_len; - mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; - mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len; - mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; + mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey; - mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr; - mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; - mad_send_wr->send_wr.num_sge = 2; - mad_send_wr->send_wr.opcode = IB_WR_SEND; - mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED; - mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn; - mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; - mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index; + /* OPA MADs don't have to be the full 2048 bytes */ + if (opa && base_version == OPA_MGMT_BASE_VERSION && + data_len < mad_size - hdr_len) + mad_send_wr->sg_list[1].length = data_len; + else + mad_send_wr->sg_list[1].length = mad_size - hdr_len; + mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; + + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; + 
mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; + mad_send_wr->send_wr.wr.num_sge = 2; + mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; + mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED; + mad_send_wr->send_wr.remote_qpn = remote_qpn; + mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY; + mad_send_wr->send_wr.pkey_index = pkey_index; + if (rmpp_active) { - ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask); + ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask); if (ret) { kfree(buf); return ERR_PTR(ret); } } mad_send_wr->send_buf.mad_agent = mad_agent; atomic_inc(&mad_agent_priv->refcount); return &mad_send_wr->send_buf; } EXPORT_SYMBOL(ib_create_send_mad); int ib_get_mad_data_offset(u8 mgmt_class) { if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) return IB_MGMT_SA_HDR; else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || (mgmt_class == IB_MGMT_CLASS_BIS)) return IB_MGMT_DEVICE_HDR; else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) return IB_MGMT_VENDOR_HDR; else return IB_MGMT_MAD_HDR; } EXPORT_SYMBOL(ib_get_mad_data_offset); int ib_is_mad_class_rmpp(u8 mgmt_class) { if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) || (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || (mgmt_class == IB_MGMT_CLASS_BIS) || ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))) return 1; return 0; } EXPORT_SYMBOL(ib_is_mad_class_rmpp); void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num) { struct ib_mad_send_wr_private *mad_send_wr; struct list_head *list; mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); list = &mad_send_wr->cur_seg->list; if (mad_send_wr->cur_seg->num < seg_num) { list_for_each_entry(mad_send_wr->cur_seg, list, list) if (mad_send_wr->cur_seg->num == seg_num) break; } else if (mad_send_wr->cur_seg->num > seg_num) { 
list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list) if (mad_send_wr->cur_seg->num == seg_num) break; } return mad_send_wr->cur_seg->data; } EXPORT_SYMBOL(ib_get_rmpp_segment); static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr) { if (mad_send_wr->send_buf.seg_count) return ib_get_rmpp_segment(&mad_send_wr->send_buf, mad_send_wr->seg_num); else - return mad_send_wr->send_buf.mad + + return (char *)mad_send_wr->send_buf.mad + mad_send_wr->send_buf.hdr_len; } void ib_free_send_mad(struct ib_mad_send_buf *send_buf) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; mad_agent_priv = container_of(send_buf->mad_agent, struct ib_mad_agent_private, agent); mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); free_send_rmpp_list(mad_send_wr); kfree(send_buf->mad); deref_mad_agent(mad_agent_priv); } EXPORT_SYMBOL(ib_free_send_mad); int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_qp_info *qp_info; struct list_head *list; struct ib_send_wr *bad_send_wr; struct ib_mad_agent *mad_agent; struct ib_sge *sge; unsigned long flags; int ret; /* Set WR ID to find mad_send_wr upon completion */ qp_info = mad_send_wr->mad_agent_priv->qp_info; - mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; sge[0].addr = ib_dma_map_single(mad_agent->device, mad_send_wr->send_buf.mad, sge[0].length, DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) return -ENOMEM; + mad_send_wr->header_mapping = sge[0].addr; + sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); - if (unlikely(ib_dma_mapping_error(mad_agent->device, 
sge[1].addr))) { - ret = -ENOMEM; - goto dma1_err; + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; } - - mad_send_wr->header_mapping = sge[0].addr; mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { - ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr, + ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); list = &qp_info->send_queue.list; } else { ret = 0; list = &qp_info->overflow_list; } if (!ret) { qp_info->send_queue.count++; list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); - - if (!ret) - return 0; - + if (ret) { ib_dma_unmap_single(mad_agent->device, mad_send_wr->header_mapping, - sge[1].length, DMA_TO_DEVICE); -dma1_err: + sge[0].length, DMA_TO_DEVICE); ib_dma_unmap_single(mad_agent->device, mad_send_wr->payload_mapping, - sge[0].length, DMA_TO_DEVICE); - return ret; -} - -/* - * Send SA MAD that passed congestion control - */ -static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, - u32 timeout_ms, u32 retries_left) -{ - int ret; - unsigned long flags; - struct ib_mad_agent_private *mad_agent_priv; - - mad_agent_priv = mad_send_wr->mad_agent_priv; - mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); - mad_send_wr->retries_left = retries_left; - mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); - - /* Reference MAD agent until send completes */ - atomic_inc(&mad_agent_priv->refcount); - spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_add_tail(&mad_send_wr->agent_list, - &mad_agent_priv->send_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - - ret = ib_send_mad(mad_send_wr); - if (ret < 0) { - /* Fail send request */ - spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&mad_send_wr->agent_list); - 
spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); + sge[1].length, DMA_TO_DEVICE); } - return ret; } /* * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client */ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, struct ib_mad_send_buf **bad_send_buf) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_buf *next_send_buf; struct ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int ret = -EINVAL; /* Walk list of send WRs and post each on send list */ for (; send_buf; send_buf = next_send_buf) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); mad_agent_priv = mad_send_wr->mad_agent_priv; if (!send_buf->mad_agent->send_handler || (send_buf->timeout_ms && !send_buf->mad_agent->recv_handler)) { ret = -EINVAL; goto error; } if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) { if (mad_agent_priv->agent.rmpp_version) { ret = -EINVAL; goto error; } } /* * Save pointer to next work request to post in case the * current one completes, and the user modifies the work * request associated with the completion */ next_send_buf = send_buf->next; - mad_send_wr->send_wr.wr.ud.ah = send_buf->ah; + mad_send_wr->send_wr.ah = send_buf->ah; if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { ret = handle_outgoing_dr_smp(mad_agent_priv, mad_send_wr); if (ret < 0) /* error */ goto error; else if (ret == 1) /* locally consumed */ continue; } mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid; /* Timeout will be updated after send completes */ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms); mad_send_wr->max_retries = send_buf->retries; mad_send_wr->retries_left = send_buf->retries; send_buf->retries = 0; /* Reference for work request to QP + response */ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); mad_send_wr->status = IB_WC_SUCCESS; - if 
(is_sa_cc_mad(mad_send_wr)) { - mad_send_wr->is_sa_cc_mad = 1; - ret = sa_cc_mad_send(mad_send_wr); - if (ret < 0) - goto error; - } else { - /* Reference MAD agent until send completes */ - atomic_inc(&mad_agent_priv->refcount); - spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_add_tail(&mad_send_wr->agent_list, - &mad_agent_priv->send_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + /* Reference MAD agent until send completes */ + atomic_inc(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { - ret = ib_send_rmpp_mad(mad_send_wr); - if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) - ret = ib_send_mad(mad_send_wr); - } else + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + ret = ib_send_rmpp_mad(mad_send_wr); + if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) ret = ib_send_mad(mad_send_wr); - if (ret < 0) { - /* Fail send request */ - spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&mad_send_wr->agent_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); - goto error; - } + } else + ret = ib_send_mad(mad_send_wr); + if (ret < 0) { + /* Fail send request */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_del(&mad_send_wr->agent_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + atomic_dec(&mad_agent_priv->refcount); + goto error; } } return 0; error: if (bad_send_buf) *bad_send_buf = send_buf; return ret; } EXPORT_SYMBOL(ib_post_send_mad); /* * ib_free_recv_mad - Returns data buffers used to receive * a MAD to the access layer */ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *priv; struct list_head free_list; 
INIT_LIST_HEAD(&free_list); list_splice_init(&mad_recv_wc->rmpp_list, &free_list); list_for_each_entry_safe(mad_recv_buf, temp_recv_buf, &free_list, list) { mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc, recv_buf); mad_priv_hdr = container_of(mad_recv_wc, struct ib_mad_private_header, recv_wc); priv = container_of(mad_priv_hdr, struct ib_mad_private, header); - kmem_cache_free(ib_mad_cache, priv); + kfree(priv); } } EXPORT_SYMBOL(ib_free_recv_mad); struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, void *context) { return ERR_PTR(-EINVAL); /* XXX: for now */ } EXPORT_SYMBOL(ib_redirect_mad_qp); int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc) { - printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n"); + dev_err(&mad_agent->device->dev, + "ib_process_mad_wc() not implemented yet\n"); return 0; } EXPORT_SYMBOL(ib_process_mad_wc); static int method_in_use(struct ib_mad_mgmt_method_table **method, struct ib_mad_reg_req *mad_reg_req) { int i; for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { - printk(KERN_ERR PFX "Method %d already in use\n", i); + pr_err("Method %d already in use\n", i); return -EINVAL; } } return 0; } static int allocate_method_table(struct ib_mad_mgmt_method_table **method) { /* Allocate management method table */ *method = kzalloc(sizeof **method, GFP_ATOMIC); if (!*method) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_method_table\n"); + pr_err("No memory for ib_mad_mgmt_method_table\n"); return -ENOMEM; } return 0; } /* * Check to see if there are any methods still in use */ static int check_method_table(struct ib_mad_mgmt_method_table *method) { int i; for (i = 0; i < IB_MGMT_MAX_METHODS; i++) if (method->agent[i]) return 1; return 0; } /* * Check to see if there are any method tables for this class still in use */ static int check_class_table(struct 
ib_mad_mgmt_class_table *class) { int i; for (i = 0; i < MAX_MGMT_CLASS; i++) if (class->method_table[i]) return 1; return 0; } static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class) { int i; for (i = 0; i < MAX_MGMT_OUI; i++) if (vendor_class->method_table[i]) return 1; return 0; } static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, - char *oui) + const char *oui) { int i; for (i = 0; i < MAX_MGMT_OUI; i++) /* Is there matching OUI for this vendor class ? */ if (!memcmp(vendor_class->oui[i], oui, 3)) return i; return -1; } static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor) { int i; for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++) if (vendor->vendor_class[i]) return 1; return 0; } static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, struct ib_mad_agent_private *agent) { int i; /* Remove any methods for this mad agent */ for (i = 0; i < IB_MGMT_MAX_METHODS; i++) { if (method->agent[i] == agent) { method->agent[i] = NULL; } } } static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv, u8 mgmt_class) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_class_table **class; struct ib_mad_mgmt_method_table **method; int i, ret; port_priv = agent_priv->qp_info->port_priv; class = &port_priv->version[mad_reg_req->mgmt_class_version].class; if (!*class) { /* Allocate management class table for "new" class version */ *class = kzalloc(sizeof **class, GFP_ATOMIC); if (!*class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_class_table\n"); ret = -ENOMEM; goto error1; } /* Allocate method table for this management class */ method = &(*class)->method_table[mgmt_class]; if ((ret = allocate_method_table(method))) goto error2; } else { method = &(*class)->method_table[mgmt_class]; if (!*method) { /* Allocate method table for this management 
class */ if ((ret = allocate_method_table(method))) goto error1; } } /* Now, make sure methods are not already in use */ if (method_in_use(method, mad_reg_req)) goto error3; /* Finally, add in methods being registered */ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; return 0; error3: /* Remove any methods for this mad agent */ remove_methods_mad_agent(*method, agent_priv); /* Now, check to see if there are any methods in use */ if (!check_method_table(*method)) { /* If not, release management method table */ kfree(*method); *method = NULL; } ret = -EINVAL; goto error1; error2: kfree(*class); *class = NULL; error1: return ret; } static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_vendor_class_table **vendor_table; struct ib_mad_mgmt_vendor_class_table *vendor = NULL; struct ib_mad_mgmt_vendor_class *vendor_class = NULL; struct ib_mad_mgmt_method_table **method; int i, ret = -ENOMEM; u8 vclass; /* "New" vendor (with OUI) class */ vclass = vendor_class_index(mad_reg_req->mgmt_class); port_priv = agent_priv->qp_info->port_priv; vendor_table = &port_priv->version[ mad_reg_req->mgmt_class_version].vendor; if (!*vendor_table) { /* Allocate mgmt vendor class table for "new" class version */ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); if (!vendor) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class_table\n"); goto error1; } *vendor_table = vendor; } if (!(*vendor_table)->vendor_class[vclass]) { /* Allocate table for this management vendor class */ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); if (!vendor_class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class\n"); goto error2; } 
(*vendor_table)->vendor_class[vclass] = vendor_class; } for (i = 0; i < MAX_MGMT_OUI; i++) { /* Is there matching OUI for this vendor class ? */ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i], mad_reg_req->oui, 3)) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; BUG_ON(!*method); goto check_in_use; } } for (i = 0; i < MAX_MGMT_OUI; i++) { /* OUI slot available ? */ if (!is_vendor_oui((*vendor_table)->vendor_class[ vclass]->oui[i])) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; BUG_ON(*method); /* Allocate method table for this OUI */ if ((ret = allocate_method_table(method))) goto error3; memcpy((*vendor_table)->vendor_class[vclass]->oui[i], mad_reg_req->oui, 3); goto check_in_use; } } - printk(KERN_ERR PFX "All OUI slots in use\n"); + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); goto error3; check_in_use: /* Now, make sure methods are not already in use */ if (method_in_use(method, mad_reg_req)) goto error4; /* Finally, add in methods being registered */ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; return 0; error4: /* Remove any methods for this mad agent */ remove_methods_mad_agent(*method, agent_priv); /* Now, check to see if there are any methods in use */ if (!check_method_table(*method)) { /* If not, release management method table */ kfree(*method); *method = NULL; } ret = -EINVAL; error3: if (vendor_class) { (*vendor_table)->vendor_class[vclass] = NULL; kfree(vendor_class); } error2: if (vendor) { *vendor_table = NULL; kfree(vendor); } error1: return ret; } static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) { struct ib_mad_port_private *port_priv; struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; int index; u8 mgmt_class; /* * Was MAD registration request supplied * with original 
registration ? */ if (!agent_priv->reg_req) { goto out; } port_priv = agent_priv->qp_info->port_priv; mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); class = port_priv->version[ agent_priv->reg_req->mgmt_class_version].class; if (!class) goto vendor_check; method = class->method_table[mgmt_class]; if (method) { /* Remove any methods for this mad agent */ remove_methods_mad_agent(method, agent_priv); /* Now, check to see if there are any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ - kfree(method); - class->method_table[mgmt_class] = NULL; - /* Any management classes left ? */ + kfree(method); + class->method_table[mgmt_class] = NULL; + /* Any management classes left ? */ if (!check_class_table(class)) { /* If not, release management class table */ kfree(class); port_priv->version[ agent_priv->reg_req-> mgmt_class_version].class = NULL; } } } vendor_check: if (!is_vendor_class(mgmt_class)) goto out; /* normalize mgmt_class to vendor range 2 */ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class); vendor = port_priv->version[ agent_priv->reg_req->mgmt_class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[mgmt_class]; if (vendor_class) { index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui); if (index < 0) goto out; method = vendor_class->method_table[index]; if (method) { /* Remove any methods for this mad agent */ remove_methods_mad_agent(method, agent_priv); /* * Now, check to see if there are * any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ kfree(method); vendor_class->method_table[index] = NULL; memset(vendor_class->oui[index], 0, 3); /* Any OUIs left ? */ if (!check_vendor_class(vendor_class)) { /* If not, release vendor class table */ kfree(vendor_class); vendor->vendor_class[mgmt_class] = NULL; /* Any other vendor classes left ? 
*/ if (!check_vendor_table(vendor)) { kfree(vendor); port_priv->version[ agent_priv->reg_req-> mgmt_class_version]. vendor = NULL; } } } } } out: return; } static struct ib_mad_agent_private * find_mad_agent(struct ib_mad_port_private *port_priv, - struct ib_mad *mad) + const struct ib_mad_hdr *mad_hdr) { struct ib_mad_agent_private *mad_agent = NULL; unsigned long flags; spin_lock_irqsave(&port_priv->reg_lock, flags); - if (ib_response_mad(mad)) { + if (ib_response_mad(mad_hdr)) { u32 hi_tid; struct ib_mad_agent_private *entry; /* * Routing is based on high 32 bits of transaction ID * of MAD. */ - hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32; + hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; list_for_each_entry(entry, &port_priv->agent_list, agent_list) { if (entry->agent.hi_tid == hi_tid) { mad_agent = entry; break; } } } else { struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; - struct ib_vendor_mad *vendor_mad; + const struct ib_vendor_mad *vendor_mad; int index; /* * Routing is based on version, class, and method * For "newer" vendor MADs, also based on OUI */ - if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION) + if (mad_hdr->class_version >= MAX_MGMT_VERSION) goto out; - if (!is_vendor_class(mad->mad_hdr.mgmt_class)) { + if (!is_vendor_class(mad_hdr->mgmt_class)) { class = port_priv->version[ - mad->mad_hdr.class_version].class; + mad_hdr->class_version].class; if (!class) goto out; - if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >= + if (convert_mgmt_class(mad_hdr->mgmt_class) >= IB_MGMT_MAX_METHODS) goto out; method = class->method_table[convert_mgmt_class( - mad->mad_hdr.mgmt_class)]; + mad_hdr->mgmt_class)]; if (method) - mad_agent = method->agent[mad->mad_hdr.method & + mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } else { vendor = port_priv->version[ - mad->mad_hdr.class_version].vendor; + 
mad_hdr->class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[vendor_class_index( - mad->mad_hdr.mgmt_class)]; + mad_hdr->mgmt_class)]; if (!vendor_class) goto out; /* Find matching OUI */ - vendor_mad = (struct ib_vendor_mad *)mad; + vendor_mad = (const struct ib_vendor_mad *)mad_hdr; index = find_vendor_oui(vendor_class, vendor_mad->oui); if (index == -1) goto out; method = vendor_class->method_table[index]; if (method) { - mad_agent = method->agent[mad->mad_hdr.method & + mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } } } if (mad_agent) { if (mad_agent->agent.recv_handler) atomic_inc(&mad_agent->refcount); else { - printk(KERN_NOTICE PFX "No receive handler for client " - "%p on port %d\n", - &mad_agent->agent, port_priv->port_num); + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); mad_agent = NULL; } } out: spin_unlock_irqrestore(&port_priv->reg_lock, flags); return mad_agent; } -static int validate_mad(struct ib_mad *mad, u32 qp_num) +static int validate_mad(const struct ib_mad_hdr *mad_hdr, + const struct ib_mad_qp_info *qp_info, + bool opa) { int valid = 0; + u32 qp_num = qp_info->qp->qp_num; /* Make sure MAD base version is understood */ - if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { - printk(KERN_ERR PFX "MAD received with unsupported base " - "version %d\n", mad->mad_hdr.base_version); + if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && + (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { + pr_err("MAD received with unsupported base version %d %s\n", + mad_hdr->base_version, opa ? 
"(opa)" : ""); goto out; } /* Filter SMI packets sent to other than QP0 */ - if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || - (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { if (qp_num == 0) valid = 1; } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; } out: return valid; } -static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_hdr *mad_hdr) +static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_hdr *mad_hdr) { - struct ib_rmpp_mad *rmpp_mad; + const struct ib_rmpp_mad *rmpp_mad; - rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; + rmpp_mad = (const struct ib_rmpp_mad *)mad_hdr; return !mad_agent_priv->agent.rmpp_version || + !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE) || (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); } -static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr, - struct ib_mad_recv_wc *rwc) +static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) { - return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class == + return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class == rwc->recv_buf.mad->mad_hdr.mgmt_class; } -static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_send_wr_private *wr, - struct ib_mad_recv_wc *rwc ) +static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_send_wr_private *wr, + const 
struct ib_mad_recv_wc *rwc ) { struct ib_ah_attr attr; u8 send_resp, rcv_resp; union ib_gid sgid; struct ib_device *device = mad_agent_priv->agent.device; u8 port_num = mad_agent_priv->agent.port_num; u8 lmc; - send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad); - rcv_resp = ib_response_mad(rwc->recv_buf.mad); + send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad); + rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr); if (send_resp == rcv_resp) /* both requests, or both responses. GIDs different */ return 0; if (ib_query_ah(wr->send_buf.ah, &attr)) /* Assume not equal, to avoid false positives. */ return 0; if (!!(attr.ah_flags & IB_AH_GRH) != !!(rwc->wc->wc_flags & IB_WC_GRH)) /* one has GID, other does not. Assume different */ return 0; if (!send_resp && rcv_resp) { /* is request/response. */ if (!(attr.ah_flags & IB_AH_GRH)) { if (ib_get_cached_lmc(device, port_num, &lmc)) return 0; return (!lmc || !((attr.src_path_bits ^ rwc->wc->dlid_path_bits) & ((1 << lmc) - 1))); } else { if (ib_get_cached_gid(device, port_num, - attr.grh.sgid_index, &sgid)) + attr.grh.sgid_index, &sgid, NULL)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); } } if (!(attr.ah_flags & IB_AH_GRH)) return attr.dlid == rwc->wc->slid; else return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw, 16); } static inline int is_direct(u8 class) { return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE); } struct ib_mad_send_wr_private* -ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_recv_wc *wc) +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *wc) { struct ib_mad_send_wr_private *wr; - struct ib_mad *mad; + const struct ib_mad_hdr *mad_hdr; - mad = (struct ib_mad *)wc->recv_buf.mad; + mad_hdr = &wc->recv_buf.mad->mad_hdr; list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { - if ((wr->tid == mad->mad_hdr.tid) && + if ((wr->tid == mad_hdr->tid) && 
rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ - (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } /* * It's possible to receive the response before we've * been notified that the send has completed */ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { - if (is_data_mad(mad_agent_priv, wr->send_buf.mad) && - wr->tid == mad->mad_hdr.tid && + if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) && + wr->tid == mad_hdr->tid && wr->timeout && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ - (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } return NULL; } void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) { mad_send_wr->timeout = 0; if (mad_send_wr->refcount == 1) list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->done_list); } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, mad_recv_wc); if (!mad_recv_wc) { deref_mad_agent(mad_agent_priv); return; } } /* Complete corresponding request */ - if (ib_response_mad(mad_recv_wc->recv_buf.mad)) { + if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) { spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = 
ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - ib_free_recv_mad(mad_recv_wc); - deref_mad_agent(mad_agent_priv); - return; - } - ib_mark_mad_done(mad_send_wr); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) + && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) + & IB_MGMT_RMPP_FLAG_ACTIVE)) { + /* user rmpp is in effect + * and this is an active RMPP MAD + */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + } else { + /* not user rmpp, revert to normal behavior and + * drop the mad */ + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + } else { + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); + /* Defined behavior is to complete response before request */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); - mad_send_wc.status = IB_WC_SUCCESS; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + } } else { - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } } -static 
bool generate_unmatched_resp(struct ib_mad_private *recv, - struct ib_mad_private *response) +static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, + const struct ib_mad_qp_info *qp_info, + const struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) { - if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET || - recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) { - memcpy(response, recv, sizeof *response); + enum smi_forward_action retsmi; + struct ib_smp *smp = (struct ib_smp *)recv->mad; + + if (smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; - response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; response->header.recv_wc.recv_buf.grh = &response->grh; - response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; - response->mad.mad.mad_hdr.status = - cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); - if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) - response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION; + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + smi_get_fwd_port(smp), + qp_info->qp->qp_num, + response->mad_size, + false); + + return 
IB_SMI_DISCARD; + } + return IB_SMI_HANDLE; +} + +static bool generate_unmatched_resp(const struct ib_mad_private *recv, + struct ib_mad_private *response, + size_t *resp_len, bool opa) +{ + const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad; + struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad; + + if (recv_hdr->method == IB_MGMT_METHOD_GET || + recv_hdr->method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + resp_hdr->method = IB_MGMT_METHOD_GET_RESP; + resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + resp_hdr->status |= IB_SMP_DIRECTION; + + if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) { + if (recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED || + recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + *resp_len = opa_get_smp_header_size( + (const struct opa_smp *)recv->mad); + else + *resp_len = sizeof(struct ib_mad_hdr); + } + return true; } else { return false; } } -static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) + +static enum smi_action +handle_opa_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) { + enum smi_forward_action retsmi; + struct opa_smp *smp = (struct opa_smp *)recv->mad; + + if (opa_smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = opa_smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if 
(opa_smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (opa_smi_check_local_smp(smp, port_priv->device) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.opa_mad = + (struct opa_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + opa_smi_get_fwd_port(smp), + qp_info->qp->qp_num, + recv->header.wc.byte_len, + true); + + return IB_SMI_DISCARD; + } + + return IB_SMI_HANDLE; +} + +static enum smi_action +handle_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response, + bool opa) +{ + struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad; + + if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION && + mad_hdr->class_version == OPA_SMI_CLASS_VERSION) + return handle_opa_smi(port_priv, qp_info, wc, port_num, recv, + response); + + return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); +} + +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; - struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; + size_t mad_size; + u16 resp_mad_pkey_index = 0; + bool opa; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if 
(list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } + qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); + opa = rdma_cap_opa_mad(qp_info->port_priv->device, + qp_info->port_priv->port_num); + mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); ib_dma_unmap_single(port_priv->device, recv->header.mapping, - sizeof(struct ib_mad_private) - - sizeof(struct ib_mad_private_header), + mad_priv_dma_size(recv), DMA_FROM_DEVICE); /* Setup MAD receive work completion from "normal" work completion */ recv->header.wc = *wc; recv->header.recv_wc.wc = &recv->header.wc; - recv->header.recv_wc.mad_len = sizeof(struct ib_mad); - recv->header.recv_wc.recv_buf.mad = &recv->mad.mad; + + if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) { + recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh); + recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + recv->header.recv_wc.mad_len = sizeof(struct ib_mad); + recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + + recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; if (atomic_read(&qp_info->snoop_count)) snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); /* Validate MAD */ - if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num)) + if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; - response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + mad_size = recv->mad_size; + response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) { - printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory " - "for response buffer\n"); + dev_err(&port_priv->device->dev, + "%s: no memory for 
response buffer\n", __func__); goto out; } - if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(port_priv->device)) port_num = wc->port_num; else port_num = port_priv->port_num; - if (recv->mad.mad.mad_hdr.mgmt_class == + if (((struct ib_mad_hdr *)recv->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - enum smi_forward_action retsmi; - - if (smi_handle_dr_smp_recv(&recv->mad.smp, - port_priv->device->node_type, - port_num, - port_priv->device->phys_port_cnt) == - IB_SMI_DISCARD) + if (handle_smi(port_priv, qp_info, wc, port_num, recv, + response, opa) + == IB_SMI_DISCARD) goto out; - - retsmi = smi_check_forward_dr_smp(&recv->mad.smp); - if (retsmi == IB_SMI_LOCAL) - goto local; - - if (retsmi == IB_SMI_SEND) { /* don't forward */ - if (smi_handle_dr_smp_send(&recv->mad.smp, - port_priv->device->node_type, - port_num) == IB_SMI_DISCARD) - goto out; - - if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD) - goto out; - } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { - /* forward case for switches */ - memcpy(response, recv, sizeof(*response)); - response->header.recv_wc.wc = &response->header.wc; - response->header.recv_wc.recv_buf.mad = &response->mad.mad; - response->header.recv_wc.recv_buf.grh = &response->grh; - - agent_send_response(&response->mad.mad, - &response->grh, wc, - port_priv->device, - smi_get_fwd_port(&recv->mad.smp), - qp_info->qp->qp_num); - - goto out; - } } -local: /* Give driver "right of first refusal" on incoming MAD */ if (port_priv->device->process_mad) { ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, wc, &recv->grh, - &recv->mad.mad, - &response->mad.mad); + (const struct ib_mad_hdr *)recv->mad, + recv->mad_size, + (struct ib_mad_hdr *)response->mad, + &mad_size, &resp_mad_pkey_index); + + if (opa) + wc->pkey_index = resp_mad_pkey_index; + if (ret & IB_MAD_RESULT_SUCCESS) { if (ret & IB_MAD_RESULT_CONSUMED) goto out; if (ret 
& IB_MAD_RESULT_REPLY) { - agent_send_response(&response->mad.mad, + agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, port_priv->device, port_num, - qp_info->qp->qp_num); + qp_info->qp->qp_num, + mad_size, opa); goto out; } } } - mad_agent = find_mad_agent(port_priv, &recv->mad.mad); + mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); if (mad_agent) { ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); /* * recv is freed up in error cases in ib_mad_complete_recv * or via recv_handler in ib_mad_complete_recv() */ recv = NULL; } else if ((ret & IB_MAD_RESULT_SUCCESS) && - generate_unmatched_resp(recv, response)) { - agent_send_response(&response->mad.mad, &recv->grh, wc, - port_priv->device, port_num, qp_info->qp->qp_num); + generate_unmatched_resp(recv, response, &mad_size, opa)) { + agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, + port_priv->device, port_num, + qp_info->qp->qp_num, mad_size, opa); } out: /* Post another receive request for this QP */ if (response) { ib_mad_post_receive_mads(qp_info, response); - if (recv) - kmem_cache_free(ib_mad_cache, recv); + kfree(recv); } else ib_mad_post_receive_mads(qp_info, recv); } static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_send_wr_private *mad_send_wr; unsigned long delay; if (list_empty(&mad_agent_priv->wait_list)) { cancel_delayed_work(&mad_agent_priv->timed_work); } else { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_agent_priv->timeout, mad_send_wr->timeout)) { mad_agent_priv->timeout = mad_send_wr->timeout; delay = mad_send_wr->timeout - jiffies; if ((long)delay <= 0) delay = 1; mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->timed_work, delay); } } } static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *mad_agent_priv; struct 
ib_mad_send_wr_private *temp_mad_send_wr; struct list_head *list_item; unsigned long delay; mad_agent_priv = mad_send_wr->mad_agent_priv; list_del(&mad_send_wr->agent_list); delay = mad_send_wr->timeout; mad_send_wr->timeout += jiffies; if (delay) { list_for_each_prev(list_item, &mad_agent_priv->wait_list) { temp_mad_send_wr = list_entry(list_item, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_send_wr->timeout, temp_mad_send_wr->timeout)) break; } } else list_item = &mad_agent_priv->wait_list; list_add(&mad_send_wr->agent_list, list_item); /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->timed_work, delay); } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, int timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); } /* * Process a send work completion */ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_agent_private *mad_agent_priv; unsigned long flags; int ret; mad_agent_priv = mad_send_wr->mad_agent_priv; spin_lock_irqsave(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); if (ret == IB_RMPP_RESULT_CONSUMED) goto done; } else ret = IB_RMPP_RESULT_UNHANDLED; if (mad_send_wc->status != IB_WC_SUCCESS && mad_send_wr->status == IB_WC_SUCCESS) { mad_send_wr->status = mad_send_wc->status; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } if (--mad_send_wr->refcount > 0) { if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && mad_send_wr->status == IB_WC_SUCCESS) { wait_for_response(mad_send_wr); } goto done; } /* Remove send from MAD agent and notify client of completion */ list_del(&mad_send_wr->agent_list); 
adjust_timeout(mad_agent_priv); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (mad_send_wr->status != IB_WC_SUCCESS ) mad_send_wc->status = mad_send_wr->status; if (ret == IB_RMPP_RESULT_INTERNAL) ib_rmpp_send_handler(mad_send_wc); - else { - if (mad_send_wr->is_sa_cc_mad) - sa_cc_mad_done(get_cc_obj(mad_send_wr)); + else mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); - } /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); return; done: spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; - struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; struct ib_send_wr *bad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; int ret; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; qp_info = send_queue->qp_info; retry: ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->header_mapping, mad_send_wr->sg_list[0].length, DMA_TO_DEVICE); ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->payload_mapping, mad_send_wr->sg_list[1].length, DMA_TO_DEVICE); queued_send_wr = NULL; spin_lock_irqsave(&send_queue->lock, flags); list_del(&mad_list->list); /* Move queued send to the send queue */ if (send_queue->count-- > send_queue->max_active) { mad_list = 
container_of(qp_info->overflow_list.next, struct ib_mad_list_head, list); queued_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); list_move_tail(&mad_list->list, &send_queue->list); } spin_unlock_irqrestore(&send_queue->lock, flags); mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_send_wc.status = wc->status; mad_send_wc.vendor_err = wc->vendor_err; if (atomic_read(&qp_info->snoop_count)) snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { - ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, + ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, &bad_send_wr); if (ret) { - printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret); + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); mad_send_wr = queued_send_wr; wc->status = IB_WC_LOC_QP_OP_ERR; goto retry; } } } static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_list_head *mad_list; unsigned long flags; spin_lock_irqsave(&qp_info->send_queue.lock, flags); list_for_each_entry(mad_list, &qp_info->send_queue.list, list) { mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); mad_send_wr->retry = 1; } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } -static void mad_error_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) { - struct ib_mad_list_head *mad_list; - struct ib_mad_qp_info *qp_info; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; - /* Determine if failure was a send or receive */ - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; - qp_info = 
mad_list->mad_queue->qp_info; - if (mad_list->mad_queue == &qp_info->recv_queue) - /* - * Receive errors indicate that the QP has entered the error - * state - error handling/shutdown code will cleanup - */ - return; - /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests */ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); if (wc->status == IB_WC_WR_FLUSH_ERR) { if (mad_send_wr->retry) { /* Repost send */ struct ib_send_wr *bad_send_wr; mad_send_wr->retry = 0; - ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr, + ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); - if (ret) - ib_mad_send_done_handler(port_priv, wc); - } else - ib_mad_send_done_handler(port_priv, wc); + if (!ret) + return false; + } } else { struct ib_qp_attr *attr; /* Transition QP to RTS and fail offending send */ attr = kmalloc(sizeof *attr, GFP_KERNEL); if (attr) { attr->qp_state = IB_QPS_RTS; attr->cur_qp_state = IB_QPS_SQE; ret = ib_modify_qp(qp_info->qp, attr, IB_QP_STATE | IB_QP_CUR_STATE); kfree(attr); if (ret) - printk(KERN_ERR PFX "mad_error_handler - " - "ib_modify_qp to RTS : %d\n", ret); + dev_err(&port_priv->device->dev, + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); else mark_sends_for_retry(qp_info); } - ib_mad_send_done_handler(port_priv, wc); } -} -/* - * IB MAD completion callback - */ -static void ib_mad_completion_handler(struct work_struct *work) -{ - struct ib_mad_port_private *port_priv; - struct ib_wc wc; - - port_priv = container_of(work, struct ib_mad_port_private, work); - ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); - - while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { - if (wc.status == IB_WC_SUCCESS) { - switch (wc.opcode) { - case IB_WC_SEND: - ib_mad_send_done_handler(port_priv, &wc); - break; - case IB_WC_RECV: - ib_mad_recv_done_handler(port_priv, &wc); - break; - default: - BUG_ON(1); - break; - } - } else - mad_error_handler(port_priv, &wc); - } + 
return true; } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) { unsigned long flags; struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; struct ib_mad_send_wc mad_send_wc; struct list_head cancel_list; INIT_LIST_HEAD(&cancel_list); - cancel_sa_cc_mads(mad_agent_priv); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &mad_agent_priv->send_list, agent_list) { if (mad_send_wr->status == IB_WC_SUCCESS) { mad_send_wr->status = IB_WC_WR_FLUSH_ERR; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } } /* Empty wait list to prevent receives from finding a request */ list_splice_init(&mad_agent_priv->wait_list, &cancel_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Report all cancelled requests */ mad_send_wc.status = IB_WC_WR_FLUSH_ERR; mad_send_wc.vendor_err = 0; list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &cancel_list, agent_list) { mad_send_wc.send_buf = &mad_send_wr->send_buf; list_del(&mad_send_wr->agent_list); - if (mad_send_wr->is_sa_cc_mad) - sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); } } static struct ib_mad_send_wr_private* find_send_wr(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_buf *send_buf) { struct ib_mad_send_wr_private *mad_send_wr; list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list, agent_list) { if (&mad_send_wr->send_buf == send_buf) return mad_send_wr; } list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, agent_list) { - if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) && + if (is_rmpp_data_mad(mad_agent_priv, + mad_send_wr->send_buf.mad) && &mad_send_wr->send_buf == send_buf) return mad_send_wr; } return NULL; } int ib_modify_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, u32 timeout_ms) { struct ib_mad_agent_private *mad_agent_priv; struct 
ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int active; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); - if (!mad_send_wr) { + if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (modify_sa_cc_mad(mad_agent_priv, send_buf, timeout_ms)) - return -EINVAL; - return 0; - } - if (mad_send_wr->status != IB_WC_SUCCESS) { - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); if (!timeout_ms) { mad_send_wr->status = IB_WC_WR_FLUSH_ERR; mad_send_wr->refcount -= (mad_send_wr->timeout > 0); } mad_send_wr->send_buf.timeout_ms = timeout_ms; if (active) mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); else ib_reset_mad_timeout(mad_send_wr, timeout_ms); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return 0; } EXPORT_SYMBOL(ib_modify_mad); void ib_cancel_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf) { ib_modify_mad(mad_agent, send_buf, 0); } EXPORT_SYMBOL(ib_cancel_mad); static void local_completions(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_local_private *local; struct ib_mad_agent_private *recv_mad_agent; unsigned long flags; int free_mad; struct ib_wc wc; struct ib_mad_send_wc mad_send_wc; + bool opa; mad_agent_priv = container_of(work, struct ib_mad_agent_private, local_work); + opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->local_list)) { local = list_entry(mad_agent_priv->local_list.next, struct ib_mad_local_private, completion_list); list_del(&local->completion_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); free_mad = 0; if 
(local->mad_priv) { + u8 base_version; recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { - printk(KERN_ERR PFX "No receive MAD agent for local completion\n"); + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local completion\n"); free_mad = 1; goto local_send_completion; } /* * Defined behavior is to complete response * before request */ build_smp_wc(recv_mad_agent->agent.qp, - (unsigned long) local->mad_send_wr, + local->mad_send_wr->send_wr.wr.wr_cqe, be16_to_cpu(IB_LID_PERMISSIVE), - 0, recv_mad_agent->agent.port_num, &wc); + local->mad_send_wr->send_wr.pkey_index, + recv_mad_agent->agent.port_num, &wc); local->mad_priv->header.recv_wc.wc = &wc; - local->mad_priv->header.recv_wc.mad_len = - sizeof(struct ib_mad); + + base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version; + if (opa && base_version == OPA_MGMT_BASE_VERSION) { + local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len; + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad); + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); list_add(&local->mad_priv->header.recv_wc.recv_buf.list, &local->mad_priv->header.recv_wc.rmpp_list); local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = - &local->mad_priv->mad.mad; + (struct ib_mad *)local->mad_priv->mad; if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) snoop_recv(recv_mad_agent->qp_info, &local->mad_priv->header.recv_wc, IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); spin_unlock_irqrestore(&recv_mad_agent->lock, flags); } local_send_completion: /* Complete send */ 
mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &local->mad_send_wr->send_buf; if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) snoop_send(mad_agent_priv->qp_info, &local->mad_send_wr->send_buf, &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); spin_lock_irqsave(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); if (free_mad) - kmem_cache_free(ib_mad_cache, local->mad_priv); + kfree(local->mad_priv); kfree(local); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) { int ret; if (!mad_send_wr->retries_left) return -ETIMEDOUT; mad_send_wr->retries_left--; mad_send_wr->send_buf.retries++; mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); - if (mad_send_wr->mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); switch (ret) { case IB_RMPP_RESULT_UNHANDLED: ret = ib_send_mad(mad_send_wr); break; case IB_RMPP_RESULT_CONSUMED: ret = 0; break; default: ret = -ECOMM; break; } } else ret = ib_send_mad(mad_send_wr); if (!ret) { mad_send_wr->refcount++; list_add_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } return ret; } static void timeout_sends(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); mad_send_wc.vendor_err = 0; spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, agent_list); if (time_after(mad_send_wr->timeout, jiffies)) { delay = mad_send_wr->timeout - jiffies; if ((long)delay <= 
0) delay = 1; queue_delayed_work(mad_agent_priv->qp_info-> port_priv->wq, &mad_agent_priv->timed_work, delay); break; } list_del(&mad_send_wr->agent_list); if (mad_send_wr->status == IB_WC_SUCCESS && !retry_send(mad_send_wr)) continue; spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (mad_send_wr->status == IB_WC_SUCCESS) mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; else mad_send_wc.status = mad_send_wr->status; mad_send_wc.send_buf = &mad_send_wr->send_buf; - if (mad_send_wr->is_sa_cc_mad) - sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) -{ - struct ib_mad_port_private *port_priv = cq->cq_context; - unsigned long flags; - - spin_lock_irqsave(&ib_mad_port_list_lock, flags); - if (!list_empty(&port_priv->port_list)) - queue_work(port_priv->wq, &port_priv->work); - spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); -} - /* * Allocate receive MADs and post receive WRs for them */ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad) { unsigned long flags; int post, ret; struct ib_mad_private *mad_priv; struct ib_sge sg_list; struct ib_recv_wr recv_wr, *bad_recv_wr; struct ib_mad_queue *recv_queue = &qp_info->recv_queue; /* Initialize common scatter list fields */ - sg_list.length = sizeof *mad_priv - sizeof mad_priv->header; - sg_list.lkey = (*qp_info->port_priv->mr).lkey; + sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; /* Initialize common receive WR fields */ recv_wr.next = NULL; recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; do { /* Allocate and map receive buffer */ if (mad) { mad_priv = mad; mad = NULL; } else { - mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + mad_priv = 
alloc_mad_private(port_mad_size(qp_info->port_priv), + GFP_ATOMIC); if (!mad_priv) { - printk(KERN_ERR PFX "No memory for receive buffer\n"); + dev_err(&qp_info->port_priv->device->dev, + "No memory for receive buffer\n"); ret = -ENOMEM; break; } } + sg_list.length = mad_priv_dma_size(mad_priv); sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, &mad_priv->grh, - sizeof *mad_priv - - sizeof mad_priv->header, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { ret = -ENOMEM; - kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_dma_map_single failed\n"); break; } - mad_priv->header.mapping = sg_list.addr; - recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); post = (++recv_queue->count < recv_queue->max_active); list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); spin_unlock_irqrestore(&recv_queue->lock, flags); ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr); if (ret) { spin_lock_irqsave(&recv_queue->lock, flags); list_del(&mad_priv->header.mad_list.list); recv_queue->count--; spin_unlock_irqrestore(&recv_queue->lock, flags); ib_dma_unmap_single(qp_info->port_priv->device, mad_priv->header.mapping, - sizeof *mad_priv - - sizeof mad_priv->header, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); - kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); + kfree(mad_priv); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); break; } } while (post); return ret; } /* * Return all the posted receive MADs */ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) { struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv; struct 
ib_mad_list_head *mad_list; if (!qp_info->qp) return; while (!list_empty(&qp_info->recv_queue.list)) { mad_list = list_entry(qp_info->recv_queue.list.next, struct ib_mad_list_head, list); mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); /* Remove from posted receive MAD list */ list_del(&mad_list->list); ib_dma_unmap_single(qp_info->port_priv->device, recv->header.mapping, - sizeof(struct ib_mad_private) - - sizeof(struct ib_mad_private_header), + mad_priv_dma_size(recv), DMA_FROM_DEVICE); - kmem_cache_free(ib_mad_cache, recv); + kfree(recv); } qp_info->recv_queue.count = 0; } /* * Start the port */ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) { int ret, i; struct ib_qp_attr *attr; struct ib_qp *qp; - u16 pkey_index = 0; + u16 pkey_index; attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { - printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n"); + dev_err(&port_priv->device->dev, + "Couldn't kmalloc ib_qp_attr\n"); return -ENOMEM; } ret = ib_find_pkey(port_priv->device, port_priv->port_num, - 0xFFFF, &pkey_index); + IB_DEFAULT_PKEY_FULL, &pkey_index); if (ret) pkey_index = 0; for (i = 0; i < IB_MAD_QPS_CORE; i++) { qp = port_priv->qp_info[i].qp; if (!qp) continue; /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition */ attr->qp_state = IB_QPS_INIT; attr->pkey_index = pkey_index; attr->qkey = (qp->qp_num == 0) ? 
0 : IB_QP1_QKEY; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "INIT: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); goto out; } attr->qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, attr, IB_QP_STATE); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTR: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); goto out; } attr->qp_state = IB_QPS_RTS; attr->sq_psn = IB_MAD_SEND_Q_PSN; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTS: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); goto out; } } ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); if (ret) { - printk(KERN_ERR PFX "Failed to request completion " - "notification: %d\n", ret); + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); goto out; } for (i = 0; i < IB_MAD_QPS_CORE; i++) { if (!port_priv->qp_info[i].qp) continue; ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { - printk(KERN_ERR PFX "Couldn't post receive WRs\n"); + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); goto out; } } out: kfree(attr); return ret; } static void qp_event_handler(struct ib_event *event, void *qp_context) { struct ib_mad_qp_info *qp_info = qp_context; /* It's worse than that! He's dead, Jim! 
*/ - printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n", + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%d)\n", event->event, qp_info->qp->qp_num); } static void init_mad_queue(struct ib_mad_qp_info *qp_info, struct ib_mad_queue *mad_queue) { mad_queue->qp_info = qp_info; mad_queue->count = 0; spin_lock_init(&mad_queue->lock); INIT_LIST_HEAD(&mad_queue->list); } static void init_mad_qp(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info) { qp_info->port_priv = port_priv; init_mad_queue(qp_info, &qp_info->send_queue); init_mad_queue(qp_info, &qp_info->recv_queue); INIT_LIST_HEAD(&qp_info->overflow_list); spin_lock_init(&qp_info->snoop_lock); qp_info->snoop_table = NULL; qp_info->snoop_table_size = 0; atomic_set(&qp_info->snoop_count, 0); } static int create_mad_qp(struct ib_mad_qp_info *qp_info, enum ib_qp_type qp_type) { struct ib_qp_init_attr qp_init_attr; int ret; memset(&qp_init_attr, 0, sizeof qp_init_attr); qp_init_attr.send_cq = qp_info->port_priv->cq; qp_init_attr.recv_cq = qp_info->port_priv->cq; qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; qp_init_attr.cap.max_send_wr = mad_sendq_size; qp_init_attr.cap.max_recv_wr = mad_recvq_size; qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG; qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG; qp_init_attr.qp_type = qp_type; qp_init_attr.port_num = qp_info->port_priv->port_num; qp_init_attr.qp_context = qp_info; qp_init_attr.event_handler = qp_event_handler; qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); if (IS_ERR(qp_info->qp)) { - printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n", - get_spl_qp_index(qp_type)); + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); ret = PTR_ERR(qp_info->qp); goto error; } /* Use minimum queue sizes unless the CQ is resized */ qp_info->send_queue.max_active = mad_sendq_size; qp_info->recv_queue.max_active = mad_recvq_size; return 0; error: return 
ret; } static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { if (!qp_info->qp) return; ib_destroy_qp(qp_info->qp); kfree(qp_info->snoop_table); } /* * Open the port * Create the QP, PD, MR, and CQ if needed */ static int ib_mad_port_open(struct ib_device *device, int port_num) { int ret, cq_size; struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; + if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) + return -EFAULT; + + if (WARN_ON(rdma_cap_opa_mad(device, port_num) && + rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE)) + return -EFAULT; + /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR PFX "No memory for ib_mad_port_private\n"); + dev_err(&device->dev, "No memory for ib_mad_port_private\n"); return -ENOMEM; } port_priv->device = device; port_priv->port_num = port_num; spin_lock_init(&port_priv->reg_lock); INIT_LIST_HEAD(&port_priv->agent_list); init_mad_qp(port_priv, &port_priv->qp_info[0]); init_mad_qp(port_priv, &port_priv->qp_info[1]); cq_size = mad_sendq_size + mad_recvq_size; - has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; + has_smi = rdma_cap_ib_smi(device, port_num); if (has_smi) cq_size *= 2; - port_priv->cq = ib_create_cq(port_priv->device, - ib_mad_thread_completion_handler, - NULL, port_priv, cq_size, 0); + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_WORKQUEUE); if (IS_ERR(port_priv->cq)) { - printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n"); + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); goto error3; } - port_priv->pd = ib_alloc_pd(device); + port_priv->pd = ib_alloc_pd(device, 0); if (IS_ERR(port_priv->pd)) { - printk(KERN_ERR PFX "Couldn't create ib_mad PD\n"); + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); goto error4; } - port_priv->mr = 
ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(port_priv->mr)) { - printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n"); - ret = PTR_ERR(port_priv->mr); - goto error5; - } - if (has_smi) { ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); if (ret) goto error6; } ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); if (ret) goto error7; snprintf(name, sizeof name, "ib_mad%d", port_num); - port_priv->wq = create_singlethread_workqueue(name); + port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); if (!port_priv->wq) { ret = -ENOMEM; goto error8; } - INIT_WORK(&port_priv->work, ib_mad_completion_handler); - if (sa_cc_init(&port_priv->sa_cc)) - goto error9; - - spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); ret = ib_mad_port_start(port_priv); if (ret) { - printk(KERN_ERR PFX "Couldn't start port\n"); - goto error10; + dev_err(&device->dev, "Couldn't start port\n"); + goto error9; } return 0; -error10: +error9: spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); -error9: - sa_cc_destroy(&port_priv->sa_cc); error8: destroy_mad_qp(&port_priv->qp_info[1]); error7: destroy_mad_qp(&port_priv->qp_info[0]); error6: - ib_dereg_mr(port_priv->mr); -error5: ib_dealloc_pd(port_priv->pd); error4: - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error3: kfree(port_priv); return ret; } /* * Close the port * If there are no classes using the port, free the port * resources (CQ, MR, PD, QP) and remove the port's info structure */ static int ib_mad_port_close(struct ib_device *device, int port_num) { struct ib_mad_port_private *port_priv; unsigned long flags; spin_lock_irqsave(&ib_mad_port_list_lock, flags); 
port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - printk(KERN_ERR PFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); - sa_cc_destroy(&port_priv->sa_cc); destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); - ib_dereg_mr(port_priv->mr); ib_dealloc_pd(port_priv->pd); - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ kfree(port_priv); return 0; } static void ib_mad_init_device(struct ib_device *device) { - int start, end, i; + int start, i; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + start = rdma_start_port(device); - if (device->node_type == RDMA_NODE_IB_SWITCH) { - start = 0; - end = 0; - } else { - start = 1; - end = device->phys_port_cnt; - } + for (i = start; i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; - for (i = start; i <= end; i++) { if (ib_mad_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } if (ib_agent_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); goto error_agent; } } return; error_agent: if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); error: - i--; + while (--i >= start) { + if (!rdma_cap_ib_mad(device, i)) + continue; - while (i >= start) { if (ib_agent_port_close(device, 
i)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); - i--; + dev_err(&device->dev, "Couldn't close port %d\n", i); } } -static void ib_mad_remove_device(struct ib_device *device) +static void ib_mad_remove_device(struct ib_device *device, void *client_data) { - int i, num_ports, cur_port; + int i; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; - if (device->node_type == RDMA_NODE_IB_SWITCH) { - num_ports = 1; - cur_port = 0; - } else { - num_ports = device->phys_port_cnt; - cur_port = 1; + if (ib_agent_port_close(device, i)) + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); } - for (i = 0; i < num_ports; i++, cur_port++) { - if (ib_agent_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, cur_port); - if (ib_mad_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, cur_port); - } } static struct ib_client mad_client = { .name = "mad", .add = ib_mad_init_device, .remove = ib_mad_remove_device }; -static int __init ib_mad_init_module(void) +int ib_mad_init(void) { - int ret; - mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); - ib_mad_cache = kmem_cache_create("ib_mad", - sizeof(struct ib_mad_private), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!ib_mad_cache) { - printk(KERN_ERR PFX "Couldn't create ib_mad 
cache\n"); - ret = -ENOMEM; - goto error1; - } - INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { - printk(KERN_ERR PFX "Couldn't register ib_mad client\n"); - ret = -EINVAL; - goto error2; + pr_err("Couldn't register ib_mad client\n"); + return -EINVAL; } return 0; - -error2: - kmem_cache_destroy(ib_mad_cache); -error1: - return ret; } -static void __exit ib_mad_cleanup_module(void) +void ib_mad_cleanup(void) { ib_unregister_client(&mad_client); - kmem_cache_destroy(ib_mad_cache); } - -module_init(ib_mad_init_module); -module_exit(ib_mad_cleanup_module); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad_priv.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad_priv.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad_priv.h (revision 319974) @@ -1,262 +1,225 @@ /* * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef __IB_MAD_PRIV_H__ #define __IB_MAD_PRIV_H__ #include #include #include #include #include +#include - -#define PFX "ib_mad: " - #define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ /* QP and CQ parameters */ #define IB_MAD_QP_SEND_SIZE 128 #define IB_MAD_QP_RECV_SIZE 512 #define IB_MAD_QP_MIN_SIZE 64 #define IB_MAD_QP_MAX_SIZE 8192 #define IB_MAD_SEND_REQ_MAX_SG 2 #define IB_MAD_RECV_REQ_MAX_SG 1 #define IB_MAD_SEND_Q_PSN 0 /* Registration table sizes */ #define MAX_MGMT_CLASS 80 -#define MAX_MGMT_VERSION 8 +#define MAX_MGMT_VERSION 0x83 #define MAX_MGMT_OUI 8 #define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1) struct ib_mad_list_head { struct list_head list; + struct ib_cqe cqe; struct ib_mad_queue *mad_queue; }; struct ib_mad_private_header { struct ib_mad_list_head mad_list; struct ib_mad_recv_wc recv_wc; struct ib_wc wc; u64 mapping; } __attribute__ ((packed)); struct ib_mad_private { struct ib_mad_private_header header; + size_t mad_size; struct ib_grh grh; - union { - struct ib_mad mad; - struct ib_rmpp_mad rmpp_mad; - struct ib_smp smp; - } mad; + u8 mad[0]; } __attribute__ ((packed)); struct ib_rmpp_segment { struct list_head list; u32 num; u8 data[0]; }; struct ib_mad_agent_private { struct list_head agent_list; struct ib_mad_agent agent; struct ib_mad_reg_req *reg_req; struct ib_mad_qp_info *qp_info; spinlock_t lock; struct list_head send_list; struct list_head wait_list; struct list_head done_list; struct 
delayed_work timed_work; unsigned long timeout; struct list_head local_list; struct work_struct local_work; struct list_head rmpp_list; atomic_t refcount; struct completion comp; }; struct ib_mad_snoop_private { struct ib_mad_agent agent; struct ib_mad_qp_info *qp_info; int snoop_index; int mad_snoop_flags; atomic_t refcount; struct completion comp; }; -/* Structure for timeout-fifo entry */ -struct tf_entry { - unsigned long exp_time; /* entry expiration time */ - struct list_head fifo_list; /* to keep entries in fifo order */ - struct list_head to_list; /* to keep entries in timeout order */ - int canceled; /* indicates whether entry is canceled */ -}; - struct ib_mad_send_wr_private { struct ib_mad_list_head mad_list; struct list_head agent_list; struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_buf send_buf; u64 header_mapping; u64 payload_mapping; - struct ib_send_wr send_wr; + struct ib_ud_wr send_wr; struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; __be64 tid; unsigned long timeout; int max_retries; int retries_left; int retry; int refcount; enum ib_wc_status status; /* RMPP control */ struct list_head rmpp_list; struct ib_rmpp_segment *last_ack_seg; struct ib_rmpp_segment *cur_seg; int last_ack; int seg_num; int newwin; int pad; - - /* SA congestion controlled MAD */ - int is_sa_cc_mad; - struct tf_entry tf_list; }; struct ib_mad_local_private { struct list_head completion_list; struct ib_mad_private *mad_priv; struct ib_mad_agent_private *recv_mad_agent; struct ib_mad_send_wr_private *mad_send_wr; + size_t return_wc_byte_len; }; struct ib_mad_mgmt_method_table { struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS]; }; struct ib_mad_mgmt_class_table { struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS]; }; struct ib_mad_mgmt_vendor_class { u8 oui[MAX_MGMT_OUI][3]; struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI]; }; struct ib_mad_mgmt_vendor_class_table { struct ib_mad_mgmt_vendor_class 
*vendor_class[MAX_MGMT_VENDOR_RANGE2]; }; struct ib_mad_mgmt_version_table { struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_vendor_class_table *vendor; }; struct ib_mad_queue { spinlock_t lock; struct list_head list; int count; int max_active; struct ib_mad_qp_info *qp_info; }; struct ib_mad_qp_info { struct ib_mad_port_private *port_priv; struct ib_qp *qp; struct ib_mad_queue send_queue; struct ib_mad_queue recv_queue; struct list_head overflow_list; spinlock_t snoop_lock; struct ib_mad_snoop_private **snoop_table; int snoop_table_size; atomic_t snoop_count; }; -struct to_fifo { - struct list_head to_head; - struct list_head fifo_head; - spinlock_t lists_lock; - struct timer_list timer; - struct work_struct work; - u32 fifo_size; - u32 num_items; - int stop_enqueue; - struct workqueue_struct *workq; -}; - -/* SA congestion control data */ -struct sa_cc_data { - spinlock_t lock; - unsigned long outstanding; - struct to_fifo *tf; -}; - struct ib_mad_port_private { struct list_head port_list; struct ib_device *device; int port_num; struct ib_cq *cq; struct ib_pd *pd; - struct ib_mr *mr; spinlock_t reg_lock; struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; struct list_head agent_list; struct workqueue_struct *wq; - struct work_struct work; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; - struct sa_cc_data sa_cc; }; int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); struct ib_mad_send_wr_private * -ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_recv_wc *mad_recv_wc); +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *mad_recv_wc); void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc); void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, int timeout_ms); #endif /* __IB_MAD_PRIV_H__ */ Index: 
projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad_rmpp.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad_rmpp.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/mad_rmpp.c (revision 319974) @@ -1,953 +1,968 @@ /* * Copyright (c) 2005 Intel Inc. All rights reserved. * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "mad_priv.h" #include "mad_rmpp.h" enum rmpp_state { RMPP_STATE_ACTIVE, RMPP_STATE_TIMEOUT, RMPP_STATE_COMPLETE, RMPP_STATE_CANCELING }; struct mad_rmpp_recv { struct ib_mad_agent_private *agent; struct list_head list; struct delayed_work timeout_work; struct delayed_work cleanup_work; struct completion comp; enum rmpp_state state; spinlock_t lock; atomic_t refcount; struct ib_ah *ah; struct ib_mad_recv_wc *rmpp_wc; struct ib_mad_recv_buf *cur_seg_buf; int last_ack; int seg_num; int newwin; int repwin; __be64 tid; u32 src_qp; u16 slid; u8 mgmt_class; u8 class_version; u8 method; + u8 base_version; }; static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { if (atomic_dec_and_test(&rmpp_recv->refcount)) complete(&rmpp_recv->comp); } static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { deref_rmpp_recv(rmpp_recv); wait_for_completion(&rmpp_recv->comp); ib_destroy_ah(rmpp_recv->ah); kfree(rmpp_recv); } void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent) { struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv; unsigned long flags; spin_lock_irqsave(&agent->lock, flags); list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { if (rmpp_recv->state != RMPP_STATE_COMPLETE) ib_free_recv_mad(rmpp_recv->rmpp_wc); rmpp_recv->state = RMPP_STATE_CANCELING; } spin_unlock_irqrestore(&agent->lock, flags); list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { cancel_delayed_work(&rmpp_recv->timeout_work); cancel_delayed_work(&rmpp_recv->cleanup_work); } flush_workqueue(agent->qp_info->port_priv->wq); list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv, &agent->rmpp_list, list) { list_del(&rmpp_recv->list); destroy_rmpp_recv(rmpp_recv); } } static void format_ack(struct ib_mad_send_buf *msg, struct ib_rmpp_mad *data, struct mad_rmpp_recv *rmpp_recv) { struct ib_rmpp_mad *ack = msg->mad; unsigned long flags; memcpy(ack, &data->mad_hdr, msg->hdr_len); ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP; ack->rmpp_hdr.rmpp_type = 
IB_MGMT_RMPP_TYPE_ACK; ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); spin_lock_irqsave(&rmpp_recv->lock, flags); rmpp_recv->last_ack = rmpp_recv->seg_num; ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num); ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin); spin_unlock_irqrestore(&rmpp_recv->lock, flags); } static void ack_recv(struct mad_rmpp_recv *rmpp_recv, struct ib_mad_recv_wc *recv_wc) { struct ib_mad_send_buf *msg; int ret, hdr_len; hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, hdr_len, - 0, GFP_KERNEL); + 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) return; format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv); msg->ah = rmpp_recv->ah; ret = ib_post_send_mad(msg, NULL); if (ret) ib_free_send_mad(msg); } static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, struct ib_mad_recv_wc *recv_wc) { struct ib_mad_send_buf *msg; struct ib_ah *ah; int hdr_len; ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, recv_wc->recv_buf.grh, agent->port_num); if (IS_ERR(ah)) return (void *) ah; hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, - hdr_len, 0, GFP_KERNEL); + hdr_len, 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) ib_destroy_ah(ah); else { msg->ah = ah; msg->context[0] = ah; } return msg; } static void ack_ds_ack(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *recv_wc) { struct ib_mad_send_buf *msg; struct ib_rmpp_mad *rmpp_mad; int ret; msg = alloc_response_msg(&agent->agent, recv_wc); if (IS_ERR(msg)) return; rmpp_mad = msg->mad; memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); 
rmpp_mad->rmpp_hdr.seg_num = 0; rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1); ret = ib_post_send_mad(msg, NULL); if (ret) { ib_destroy_ah(msg->ah); ib_free_send_mad(msg); } } void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc) { if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah) ib_destroy_ah(mad_send_wc->send_buf->ah); ib_free_send_mad(mad_send_wc->send_buf); } static void nack_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *recv_wc, u8 rmpp_status) { struct ib_mad_send_buf *msg; struct ib_rmpp_mad *rmpp_mad; int ret; msg = alloc_response_msg(&agent->agent, recv_wc); if (IS_ERR(msg)) return; rmpp_mad = msg->mad; memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION; rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status; rmpp_mad->rmpp_hdr.seg_num = 0; rmpp_mad->rmpp_hdr.paylen_newwin = 0; ret = ib_post_send_mad(msg, NULL); if (ret) { ib_destroy_ah(msg->ah); ib_free_send_mad(msg); } } static void recv_timeout_handler(struct work_struct *work) { struct mad_rmpp_recv *rmpp_recv = container_of(work, struct mad_rmpp_recv, timeout_work.work); struct ib_mad_recv_wc *rmpp_wc; unsigned long flags; spin_lock_irqsave(&rmpp_recv->agent->lock, flags); if (rmpp_recv->state != RMPP_STATE_ACTIVE) { spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); return; } rmpp_recv->state = RMPP_STATE_TIMEOUT; list_del(&rmpp_recv->list); spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); rmpp_wc = rmpp_recv->rmpp_wc; nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L); destroy_rmpp_recv(rmpp_recv); ib_free_recv_mad(rmpp_wc); } static void recv_cleanup_handler(struct work_struct *work) { struct mad_rmpp_recv *rmpp_recv = container_of(work, struct mad_rmpp_recv, cleanup_work.work); unsigned long flags; 
spin_lock_irqsave(&rmpp_recv->agent->lock, flags); if (rmpp_recv->state == RMPP_STATE_CANCELING) { spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); return; } list_del(&rmpp_recv->list); spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); destroy_rmpp_recv(rmpp_recv); } static struct mad_rmpp_recv * create_rmpp_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; struct ib_mad_hdr *mad_hdr; rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL); if (!rmpp_recv) return NULL; rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, agent->agent.port_num); if (IS_ERR(rmpp_recv->ah)) goto error; rmpp_recv->agent = agent; init_completion(&rmpp_recv->comp); INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler); INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler); spin_lock_init(&rmpp_recv->lock); rmpp_recv->state = RMPP_STATE_ACTIVE; atomic_set(&rmpp_recv->refcount, 1); rmpp_recv->rmpp_wc = mad_recv_wc; rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf; rmpp_recv->newwin = 1; rmpp_recv->seg_num = 1; rmpp_recv->last_ack = 0; rmpp_recv->repwin = 1; mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; rmpp_recv->tid = mad_hdr->tid; rmpp_recv->src_qp = mad_recv_wc->wc->src_qp; rmpp_recv->slid = mad_recv_wc->wc->slid; rmpp_recv->mgmt_class = mad_hdr->mgmt_class; rmpp_recv->class_version = mad_hdr->class_version; rmpp_recv->method = mad_hdr->method; + rmpp_recv->base_version = mad_hdr->base_version; return rmpp_recv; error: kfree(rmpp_recv); return NULL; } static struct mad_rmpp_recv * find_rmpp_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { if (rmpp_recv->tid == mad_hdr->tid && rmpp_recv->src_qp == mad_recv_wc->wc->src_qp && rmpp_recv->slid == mad_recv_wc->wc->slid 
&& rmpp_recv->mgmt_class == mad_hdr->mgmt_class && rmpp_recv->class_version == mad_hdr->class_version && rmpp_recv->method == mad_hdr->method) return rmpp_recv; } return NULL; } static struct mad_rmpp_recv * acquire_rmpp_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; unsigned long flags; spin_lock_irqsave(&agent->lock, flags); rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); if (rmpp_recv) atomic_inc(&rmpp_recv->refcount); spin_unlock_irqrestore(&agent->lock, flags); return rmpp_recv; } static struct mad_rmpp_recv * insert_rmpp_recv(struct ib_mad_agent_private *agent, struct mad_rmpp_recv *rmpp_recv) { struct mad_rmpp_recv *cur_rmpp_recv; cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc); if (!cur_rmpp_recv) list_add_tail(&rmpp_recv->list, &agent->rmpp_list); return cur_rmpp_recv; } static inline int get_last_flag(struct ib_mad_recv_buf *seg) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *) seg->mad; return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST; } static inline int get_seg_num(struct ib_mad_recv_buf *seg) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *) seg->mad; return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); } static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list, struct ib_mad_recv_buf *seg) { if (seg->list.next == rmpp_list) return NULL; return container_of(seg->list.next, struct ib_mad_recv_buf, list); } static inline int window_size(struct ib_mad_agent_private *agent) { return max(agent->qp_info->recv_queue.max_active >> 3, 1); } static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list, int seg_num) { struct ib_mad_recv_buf *seg_buf; int cur_seg_num; list_for_each_entry_reverse(seg_buf, rmpp_list, list) { cur_seg_num = get_seg_num(seg_buf); if (seg_num > cur_seg_num) return seg_buf; if (seg_num == cur_seg_num) break; } return NULL; } static void update_seg_num(struct mad_rmpp_recv 
*rmpp_recv, struct ib_mad_recv_buf *new_buf) { struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list; while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) { rmpp_recv->cur_seg_buf = new_buf; rmpp_recv->seg_num++; new_buf = get_next_seg(rmpp_list, new_buf); } } static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) { struct ib_rmpp_mad *rmpp_mad; int hdr_size, data_size, pad; + bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device, + rmpp_recv->agent->qp_info->port_priv->port_num); rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad; hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); - data_size = sizeof(struct ib_rmpp_mad) - hdr_size; - pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); - if (pad > IB_MGMT_RMPP_DATA || pad < 0) - pad = 0; + if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) { + data_size = sizeof(struct opa_rmpp_mad) - hdr_size; + pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > OPA_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } else { + data_size = sizeof(struct ib_rmpp_mad) - hdr_size; + pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > IB_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } return hdr_size + rmpp_recv->seg_num * data_size - pad; } static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv) { struct ib_mad_recv_wc *rmpp_wc; ack_recv(rmpp_recv, rmpp_recv->rmpp_wc); if (rmpp_recv->seg_num > 1) cancel_delayed_work(&rmpp_recv->timeout_work); rmpp_wc = rmpp_recv->rmpp_wc; rmpp_wc->mad_len = get_mad_len(rmpp_recv); /* 10 seconds until we can find the packet lifetime */ queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq, &rmpp_recv->cleanup_work, msecs_to_jiffies(10000)); return rmpp_wc; } static struct ib_mad_recv_wc * continue_rmpp(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; struct 
ib_mad_recv_buf *prev_buf; struct ib_mad_recv_wc *done_wc; int seg_num; unsigned long flags; rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc); if (!rmpp_recv) goto drop1; seg_num = get_seg_num(&mad_recv_wc->recv_buf); spin_lock_irqsave(&rmpp_recv->lock, flags); if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) || (seg_num > rmpp_recv->newwin)) goto drop3; if ((seg_num <= rmpp_recv->last_ack) || (rmpp_recv->state == RMPP_STATE_COMPLETE)) { spin_unlock_irqrestore(&rmpp_recv->lock, flags); ack_recv(rmpp_recv, mad_recv_wc); goto drop2; } prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num); if (!prev_buf) goto drop3; done_wc = NULL; list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list); if (rmpp_recv->cur_seg_buf == prev_buf) { update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf); if (get_last_flag(rmpp_recv->cur_seg_buf)) { rmpp_recv->state = RMPP_STATE_COMPLETE; spin_unlock_irqrestore(&rmpp_recv->lock, flags); done_wc = complete_rmpp(rmpp_recv); goto out; } else if (rmpp_recv->seg_num == rmpp_recv->newwin) { rmpp_recv->newwin += window_size(agent); spin_unlock_irqrestore(&rmpp_recv->lock, flags); ack_recv(rmpp_recv, mad_recv_wc); goto out; } } spin_unlock_irqrestore(&rmpp_recv->lock, flags); out: deref_rmpp_recv(rmpp_recv); return done_wc; drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags); drop2: deref_rmpp_recv(rmpp_recv); drop1: ib_free_recv_mad(mad_recv_wc); return NULL; } static struct ib_mad_recv_wc * start_rmpp(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; unsigned long flags; rmpp_recv = create_rmpp_recv(agent, mad_recv_wc); if (!rmpp_recv) { ib_free_recv_mad(mad_recv_wc); return NULL; } spin_lock_irqsave(&agent->lock, flags); if (insert_rmpp_recv(agent, rmpp_recv)) { spin_unlock_irqrestore(&agent->lock, flags); /* duplicate first MAD */ destroy_rmpp_recv(rmpp_recv); return continue_rmpp(agent, mad_recv_wc); } atomic_inc(&rmpp_recv->refcount); if 
(get_last_flag(&mad_recv_wc->recv_buf)) { rmpp_recv->state = RMPP_STATE_COMPLETE; spin_unlock_irqrestore(&agent->lock, flags); complete_rmpp(rmpp_recv); } else { spin_unlock_irqrestore(&agent->lock, flags); /* 40 seconds until we can find the packet lifetimes */ queue_delayed_work(agent->qp_info->port_priv->wq, &rmpp_recv->timeout_work, msecs_to_jiffies(40000)); rmpp_recv->newwin += window_size(agent); ack_recv(rmpp_recv, mad_recv_wc); mad_recv_wc = NULL; } deref_rmpp_recv(rmpp_recv); return mad_recv_wc; } static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; int timeout; u32 paylen = 0; rmpp_mad = mad_send_wr->send_buf.mad; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num); if (mad_send_wr->seg_num == 1) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; - paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA - - mad_send_wr->pad; + paylen = (mad_send_wr->send_buf.seg_count * + mad_send_wr->send_buf.seg_rmpp_size) - + mad_send_wr->pad; } if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; - paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad; + paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad; } rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); /* 2 seconds for an ACK until we can find the packet lifetime */ timeout = mad_send_wr->send_buf.timeout_ms; if (!timeout || timeout > 2000) mad_send_wr->timeout = msecs_to_jiffies(2000); return ib_send_mad(mad_send_wr); } static void abort_send(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc wc; unsigned long flags; spin_lock_irqsave(&agent->lock, flags); mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); if (!mad_send_wr) goto out; /* Unmatched send */ if ((mad_send_wr->last_ack 
== mad_send_wr->send_buf.seg_count) || (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) goto out; /* Send is already done */ ib_mark_mad_done(mad_send_wr); spin_unlock_irqrestore(&agent->lock, flags); wc.status = IB_WC_REM_ABORT_ERR; wc.vendor_err = rmpp_status; wc.send_buf = &mad_send_wr->send_buf; ib_mad_complete_send_wr(mad_send_wr, &wc); return; out: spin_unlock_irqrestore(&agent->lock, flags); } static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr, int seg_num) { struct list_head *list; wr->last_ack = seg_num; list = &wr->last_ack_seg->list; list_for_each_entry(wr->last_ack_seg, list, list) if (wr->last_ack_seg->num == seg_num) break; } static void process_ds_ack(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc, int newwin) { struct mad_rmpp_recv *rmpp_recv; rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE) rmpp_recv->repwin = newwin; } static void process_rmpp_ack(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_rmpp_mad *rmpp_mad; unsigned long flags; int seg_num, newwin, ret; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (rmpp_mad->rmpp_hdr.rmpp_status) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); return; } seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); if (newwin < seg_num) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); return; } spin_lock_irqsave(&agent->lock, flags); mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); if (!mad_send_wr) { if (!seg_num) process_ds_ack(agent, mad_recv_wc, newwin); goto out; /* Unmatched or DS RMPP ACK */ } if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) && (mad_send_wr->timeout)) { 
spin_unlock_irqrestore(&agent->lock, flags); ack_ds_ack(agent, mad_recv_wc); return; /* Repeated ACK for DS RMPP transaction */ } if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) goto out; /* Send is already done */ if (seg_num > mad_send_wr->send_buf.seg_count || seg_num > mad_send_wr->newwin) { spin_unlock_irqrestore(&agent->lock, flags); abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); return; } if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack) goto out; /* Old ACK */ if (seg_num > mad_send_wr->last_ack) { adjust_last_ack(mad_send_wr, seg_num); mad_send_wr->retries_left = mad_send_wr->max_retries; } mad_send_wr->newwin = newwin; if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { /* If no response is expected, the ACK completes the send */ if (!mad_send_wr->send_buf.timeout_ms) { struct ib_mad_send_wc wc; ib_mark_mad_done(mad_send_wr); spin_unlock_irqrestore(&agent->lock, flags); wc.status = IB_WC_SUCCESS; wc.vendor_err = 0; wc.send_buf = &mad_send_wr->send_buf; ib_mad_complete_send_wr(mad_send_wr, &wc); return; } if (mad_send_wr->refcount == 1) ib_reset_mad_timeout(mad_send_wr, mad_send_wr->send_buf.timeout_ms); spin_unlock_irqrestore(&agent->lock, flags); ack_ds_ack(agent, mad_recv_wc); return; } else if (mad_send_wr->refcount == 1 && mad_send_wr->seg_num < mad_send_wr->newwin && mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { /* Send failure will just result in a timeout/retry */ ret = send_next_seg(mad_send_wr); if (ret) goto out; mad_send_wr->refcount++; list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } out: spin_unlock_irqrestore(&agent->lock, flags); } static struct ib_mad_recv_wc * process_rmpp_data(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_hdr *rmpp_hdr; u8 rmpp_status; rmpp_hdr = 
&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr; if (rmpp_hdr->rmpp_status) { rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS; goto bad; } if (rmpp_hdr->seg_num == cpu_to_be32(1)) { if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) { rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; goto bad; } return start_rmpp(agent, mad_recv_wc); } else { if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) { rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; goto bad; } return continue_rmpp(agent, mad_recv_wc); } bad: nack_recv(agent, mad_recv_wc, rmpp_status); ib_free_recv_mad(mad_recv_wc); return NULL; } static void process_rmpp_stop(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); } else abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); } static void process_rmpp_abort(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN || rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); } else abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); } struct ib_mad_recv_wc * ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE)) return mad_recv_wc; if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) { abort_send(agent, 
mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); goto out; } switch (rmpp_mad->rmpp_hdr.rmpp_type) { case IB_MGMT_RMPP_TYPE_DATA: return process_rmpp_data(agent, mad_recv_wc); case IB_MGMT_RMPP_TYPE_ACK: process_rmpp_ack(agent, mad_recv_wc); break; case IB_MGMT_RMPP_TYPE_STOP: process_rmpp_stop(agent, mad_recv_wc); break; case IB_MGMT_RMPP_TYPE_ABORT: process_rmpp_abort(agent, mad_recv_wc); break; default: abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); break; } out: ib_free_recv_mad(mad_recv_wc); return NULL; } static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv; struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad; struct mad_rmpp_recv *rmpp_recv; struct ib_ah_attr ah_attr; unsigned long flags; int newwin = 1; if (!(mad_hdr->method & IB_MGMT_METHOD_RESP)) goto out; spin_lock_irqsave(&agent->lock, flags); list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { if (rmpp_recv->tid != mad_hdr->tid || rmpp_recv->mgmt_class != mad_hdr->mgmt_class || rmpp_recv->class_version != mad_hdr->class_version || (rmpp_recv->method & IB_MGMT_METHOD_RESP)) continue; if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr)) continue; if (rmpp_recv->slid == ah_attr.dlid) { newwin = rmpp_recv->repwin; break; } } spin_unlock_irqrestore(&agent->lock, flags); out: return newwin; } int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) { mad_send_wr->seg_num = 1; return IB_RMPP_RESULT_INTERNAL; } mad_send_wr->newwin = init_newwin(mad_send_wr); /* We need to wait for the final ACK even if there isn't a response */ mad_send_wr->refcount += 
(mad_send_wr->timeout == 0); ret = send_next_seg(mad_send_wr); if (!ret) return IB_RMPP_RESULT_CONSUMED; return ret; } int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc) { struct ib_rmpp_mad *rmpp_mad; int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ if (mad_send_wc->status != IB_WC_SUCCESS || mad_send_wr->status != IB_WC_SUCCESS) return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ if (!mad_send_wr->timeout) return IB_RMPP_RESULT_PROCESSED; /* Response received */ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); return IB_RMPP_RESULT_PROCESSED; /* Send done */ } if (mad_send_wr->seg_num == mad_send_wr->newwin || mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */ ret = send_next_seg(mad_send_wr); if (ret) { mad_send_wc->status = IB_WC_GENERAL_ERR; return IB_RMPP_RESULT_PROCESSED; } return IB_RMPP_RESULT_CONSUMED; } int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) return IB_RMPP_RESULT_PROCESSED; mad_send_wr->seg_num = mad_send_wr->last_ack; mad_send_wr->cur_seg = mad_send_wr->last_ack_seg; ret = send_next_seg(mad_send_wr); if (ret) return IB_RMPP_RESULT_PROCESSED; return IB_RMPP_RESULT_CONSUMED; } Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/multicast.c =================================================================== 
--- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/multicast.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/multicast.c (revision 319974) @@ -1,916 +1,900 @@ /* * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #define LINUXKPI_PARAM_PREFIX ibcore_ #include #include #include #include -#include #include #include #include -#include #include #include #include "sa.h" -static int mcast_leave_retries = 3; - -/*static const struct kernel_param_ops retry_ops = { - .set = param_set_int, - .get = param_get_int, -}; - -module_param_cb(mcast_leave_retries, &retry_ops, &mcast_leave_retries, 0644); -MODULE_PARM_DESC(mcast_leave_retries, "Number of retries for multicast leave " - "requests before giving up (default: 3)"); -*/ static void mcast_add_one(struct ib_device *device); -static void mcast_remove_one(struct ib_device *device); +static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { .name = "ib_multicast", .add = mcast_add_one, .remove = mcast_remove_one }; static struct ib_sa_client sa_client; static struct workqueue_struct *mcast_wq; static union ib_gid mgid0; struct mcast_device; struct mcast_port { struct mcast_device *dev; spinlock_t lock; struct rb_root table; atomic_t refcount; struct completion comp; u8 port_num; }; struct mcast_device { struct ib_device *device; struct ib_event_handler event_handler; int start_port; int end_port; struct mcast_port port[0]; }; enum mcast_state { MCAST_JOINING, MCAST_MEMBER, MCAST_ERROR, }; enum mcast_group_state { MCAST_IDLE, MCAST_BUSY, MCAST_GROUP_ERROR, MCAST_PKEY_EVENT }; enum { MCAST_INVALID_PKEY_INDEX = 0xFFFF }; struct mcast_member; struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; struct mcast_port *port; spinlock_t lock; struct work_struct work; struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; - int members[3]; + int members[NUM_JOIN_MEMBERSHIP_TYPES]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; - int query_id; u16 pkey_index; u8 leave_state; int retries; }; struct mcast_member { struct ib_sa_multicast multicast; struct ib_sa_client *client; struct mcast_group *group; 
struct list_head list; enum mcast_state state; atomic_t refcount; struct completion comp; }; static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static struct mcast_group *mcast_find(struct mcast_port *port, union ib_gid *mgid) { struct rb_node *node = port->table.rb_node; struct mcast_group *group; int ret; while (node) { group = rb_entry(node, struct mcast_group, node); ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); if (!ret) return group; if (ret < 0) node = node->rb_left; else node = node->rb_right; } return NULL; } static struct mcast_group *mcast_insert(struct mcast_port *port, struct mcast_group *group, int allow_duplicates) { struct rb_node **link = &port->table.rb_node; struct rb_node *parent = NULL; struct mcast_group *cur_group; int ret; while (*link) { parent = *link; cur_group = rb_entry(parent, struct mcast_group, node); ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, sizeof group->rec.mgid); if (ret < 0) link = &(*link)->rb_left; else if (ret > 0) link = &(*link)->rb_right; else if (allow_duplicates) link = &(*link)->rb_left; else return cur_group; } rb_link_node(&group->node, parent, link); rb_insert_color(&group->node, &port->table); return NULL; } static void deref_port(struct mcast_port *port) { if (atomic_dec_and_test(&port->refcount)) complete(&port->comp); } static void release_group(struct mcast_group *group) { struct mcast_port *port = group->port; unsigned long flags; spin_lock_irqsave(&port->lock, flags); if (atomic_dec_and_test(&group->refcount)) { rb_erase(&group->node, &port->table); spin_unlock_irqrestore(&port->lock, flags); kfree(group); deref_port(port); } else spin_unlock_irqrestore(&port->lock, flags); } static void deref_member(struct mcast_member *member) { if (atomic_dec_and_test(&member->refcount)) complete(&member->comp); } static void queue_join(struct mcast_member *member) { struct 
mcast_group *group = member->group; unsigned long flags; spin_lock_irqsave(&group->lock, flags); list_add_tail(&member->list, &group->pending_list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } spin_unlock_irqrestore(&group->lock, flags); } /* - * A multicast group has three types of members: full member, non member, and - * send only member. We need to keep track of the number of members of each + * A multicast group has four types of members: full member, non member, + * sendonly non member and sendonly full member. + * We need to keep track of the number of members of each * type based on their join state. Adjust the number of members the belong to * the specified join states. */ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; - for (i = 0; i < 3; i++, join_state >>= 1) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } /* * If a multicast group has zero members left for a particular join state, but * the group is still a member with the SA, we need to leave that join state. * Determine which join states we still belong to, but that do not have any * active members. 
*/ static u8 get_leave_state(struct mcast_group *group) { u8 leave_state = 0; int i; - for (i = 0; i < 3; i++) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) if (!group->members[i]) leave_state |= (0x1 << i); return leave_state & group->rec.join_state; } static int check_selector(ib_sa_comp_mask comp_mask, ib_sa_comp_mask selector_mask, ib_sa_comp_mask value_mask, u8 selector, u8 src_value, u8 dst_value) { int err; if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) return 0; switch (selector) { case IB_SA_GT: err = (src_value <= dst_value); break; case IB_SA_LT: err = (src_value >= dst_value); break; case IB_SA_EQ: err = (src_value != dst_value); break; default: err = 0; break; } return err; } static int cmp_rec(struct ib_sa_mcmember_rec *src, struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) { /* MGID must already match */ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID && memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, - IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, - src->mtu, dst->mtu)) + IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->traffic_class != dst->traffic_class) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, - IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, - src->rate, dst->rate)) + IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) return -EINVAL; if (check_selector(comp_mask, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, - dst->packet_life_time_selector, - src->packet_life_time, dst->packet_life_time)) + 
IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + src->packet_life_time, dst->packet_life_time)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && src->flow_label != dst->flow_label) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && src->hop_limit != dst->hop_limit) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope) return -EINVAL; /* join_state checked separately, proxy_join ignored */ return 0; } static int send_join(struct mcast_group *group, struct mcast_member *member) { struct mcast_port *port = group->port; int ret; group->last_join = member; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_MGMT_METHOD_SET, &member->multicast.rec, member->multicast.comp_mask, 3000, GFP_KERNEL, join_handler, group, &group->query); - if (ret >= 0) { - group->query_id = ret; - ret = 0; - } - return ret; + return (ret > 0) ? 0 : ret; } static int send_leave(struct mcast_group *group, u8 leave_state) { struct mcast_port *port = group->port; struct ib_sa_mcmember_rec rec; int ret; rec = group->rec; rec.join_state = leave_state; group->leave_state = leave_state; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_SA_METHOD_DELETE, &rec, IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE, 3000, GFP_KERNEL, leave_handler, group, &group->query); - if (ret >= 0) { - group->query_id = ret; - ret = 0; - } - return ret; + return (ret > 0) ? 
0 : ret; } static void join_group(struct mcast_group *group, struct mcast_member *member, u8 join_state) { member->state = MCAST_MEMBER; adjust_membership(group, join_state, 1); group->rec.join_state |= join_state; member->multicast.rec = group->rec; member->multicast.rec.join_state = join_state; list_move(&member->list, &group->active_list); } static int fail_join(struct mcast_group *group, struct mcast_member *member, int status) { spin_lock_irq(&group->lock); list_del_init(&member->list); spin_unlock_irq(&group->lock); return member->multicast.callback(status, &member->multicast); } static void process_group_error(struct mcast_group *group) { struct mcast_member *member; int ret = 0; u16 pkey_index; if (group->state == MCAST_PKEY_EVENT) ret = ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(group->rec.pkey), &pkey_index); spin_lock_irq(&group->lock); if (group->state == MCAST_PKEY_EVENT && !ret && group->pkey_index == pkey_index) goto out; while (!list_empty(&group->active_list)) { member = list_entry(group->active_list.next, struct mcast_member, list); atomic_inc(&member->refcount); list_del_init(&member->list); adjust_membership(group, member->multicast.rec.join_state, -1); member->state = MCAST_ERROR; spin_unlock_irq(&group->lock); ret = member->multicast.callback(-ENETRESET, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } group->rec.join_state = 0; out: group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); } static void mcast_work_handler(struct work_struct *work) { struct mcast_group *group; struct mcast_member *member; struct ib_sa_multicast *multicast; int status, ret; u8 join_state; group = container_of(work, typeof(*group), work); retest: spin_lock_irq(&group->lock); while (!list_empty(&group->pending_list) || (group->state != MCAST_BUSY)) { if (group->state != MCAST_BUSY) { spin_unlock_irq(&group->lock); process_group_error(group); goto retest; } 
member = list_entry(group->pending_list.next, struct mcast_member, list); multicast = &member->multicast; join_state = multicast->rec.join_state; atomic_inc(&member->refcount); if (join_state == (group->rec.join_state & join_state)) { status = cmp_rec(&group->rec, &multicast->rec, multicast->comp_mask); if (!status) join_group(group, member, join_state); else list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = multicast->callback(status, multicast); } else { spin_unlock_irq(&group->lock); status = send_join(group, member); if (!status) { deref_member(member); return; } ret = fail_join(group, member, status); } deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } join_state = get_leave_state(group); if (join_state) { group->rec.join_state &= ~join_state; spin_unlock_irq(&group->lock); if (send_leave(group, join_state)) goto retest; } else { group->state = MCAST_IDLE; spin_unlock_irq(&group->lock); release_group(group); } } /* * Fail a join request if it is still active - at the head of the pending queue. 
*/ static void process_join_error(struct mcast_group *group, int status) { struct mcast_member *member; int ret; spin_lock_irq(&group->lock); member = list_entry(group->pending_list.next, struct mcast_member, list); if (group->last_join == member) { atomic_inc(&member->refcount); list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = member->multicast.callback(status, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); } else spin_unlock_irq(&group->lock); } static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; u16 pkey_index = MCAST_INVALID_PKEY_INDEX; if (status) process_join_error(group, status); else { + int mgids_changed, is_mgid0; ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index); spin_lock_irq(&group->port->lock); - group->rec = *rec; if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; - if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { + mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, + sizeof(group->rec.mgid)); + group->rec = *rec; + if (mgids_changed) { rb_erase(&group->node, &group->port->table); - mcast_insert(group->port, group, 1); + is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, + sizeof(mgid0)); + mcast_insert(group->port, group, is_mgid0); } spin_unlock_irq(&group->port->lock); } mcast_work_handler(&group->work); } static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; if (status && group->retries > 0 && !send_leave(group, group->leave_state)) group->retries--; - else { - if (status && group->retries <= 0) - printk(KERN_WARNING "reached max retry count. " - "status=%d. 
Giving up\n", status); + else mcast_work_handler(&group->work); - } } static struct mcast_group *acquire_group(struct mcast_port *port, union ib_gid *mgid, gfp_t gfp_mask) { struct mcast_group *group, *cur_group; unsigned long flags; int is_mgid0; is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); if (!is_mgid0) { spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) goto found; spin_unlock_irqrestore(&port->lock, flags); } group = kzalloc(sizeof *group, gfp_mask); if (!group) return NULL; - group->retries = mcast_leave_retries; + group->retries = 3; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; INIT_LIST_HEAD(&group->pending_list); INIT_LIST_HEAD(&group->active_list); INIT_WORK(&group->work, mcast_work_handler); spin_lock_init(&group->lock); spin_lock_irqsave(&port->lock, flags); cur_group = mcast_insert(port, group, is_mgid0); if (cur_group) { kfree(group); group = cur_group; } else atomic_inc(&port->refcount); found: atomic_inc(&group->refcount); spin_unlock_irqrestore(&port->lock, flags); return group; } /* * We serialize all join requests to a single group to make our lives much * easier. Otherwise, two users could try to join the same group * simultaneously, with different configurations, one could leave while the * join is in progress, etc., which makes locking around error recovery * difficult. 
*/ struct ib_sa_multicast * ib_sa_join_multicast(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, struct ib_sa_multicast *multicast), void *context) { struct mcast_device *dev; struct mcast_member *member; struct ib_sa_multicast *multicast; int ret; dev = ib_get_client_data(device, &mcast_client); if (!dev) return ERR_PTR(-ENODEV); member = kmalloc(sizeof *member, gfp_mask); if (!member) return ERR_PTR(-ENOMEM); ib_sa_client_get(client); member->client = client; member->multicast.rec = *rec; member->multicast.comp_mask = comp_mask; member->multicast.callback = callback; member->multicast.context = context; init_completion(&member->comp); atomic_set(&member->refcount, 1); member->state = MCAST_JOINING; member->group = acquire_group(&dev->port[port_num - dev->start_port], &rec->mgid, gfp_mask); if (!member->group) { ret = -ENOMEM; goto err; } /* * The user will get the multicast structure in their callback. They * could then free the multicast structure before we can return from * this routine. So we save the pointer to return before queuing * any callback. 
*/ multicast = &member->multicast; queue_join(member); return multicast; err: ib_sa_client_put(client); kfree(member); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_sa_join_multicast); void ib_sa_free_multicast(struct ib_sa_multicast *multicast) { struct mcast_member *member; struct mcast_group *group; member = container_of(multicast, struct mcast_member, multicast); group = member->group; spin_lock_irq(&group->lock); if (member->state == MCAST_MEMBER) adjust_membership(group, multicast->rec.join_state, -1); list_del_init(&member->list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); /* Continue to hold reference on group until callback */ queue_work(mcast_wq, &group->work); } else { spin_unlock_irq(&group->lock); release_group(group); } deref_member(member); wait_for_completion(&member->comp); ib_sa_client_put(member->client); kfree(member); } EXPORT_SYMBOL(ib_sa_free_multicast); int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) { struct mcast_device *dev; struct mcast_port *port; struct mcast_group *group; unsigned long flags; int ret = 0; dev = ib_get_client_data(device, &mcast_client); if (!dev) return -ENODEV; port = &dev->port[port_num - dev->start_port]; spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) *rec = group->rec; else ret = -EADDRNOTAVAIL; spin_unlock_irqrestore(&port->lock, flags); return ret; } EXPORT_SYMBOL(ib_sa_get_mcmember_rec); int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index); + if (rdma_protocol_roce(device, port_num)) { + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, + &gid_index); + } else if (rdma_protocol_ib(device, port_num)) { + 
ret = ib_find_cached_gid(device, &rec->port_gid, + IB_GID_TYPE_IB, NULL, &p, + &gid_index); + } else { + ret = -EINVAL; + } + if (ret) return ret; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->mlid); ah_attr->sl = rec->sl; ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->mgid; ah_attr->grh.sgid_index = (u8) gid_index; ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; return 0; } EXPORT_SYMBOL(ib_init_ah_from_mcmember); static void mcast_groups_event(struct mcast_port *port, enum mcast_group_state state) { struct mcast_group *group; struct rb_node *node; unsigned long flags; spin_lock_irqsave(&port->lock, flags); for (node = rb_first(&port->table); node; node = rb_next(node)) { group = rb_entry(node, struct mcast_group, node); spin_lock(&group->lock); if (group->state == MCAST_IDLE) { atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } if (group->state != MCAST_GROUP_ERROR) group->state = state; spin_unlock(&group->lock); } spin_unlock_irqrestore(&port->lock, flags); } static void mcast_event_handler(struct ib_event_handler *handler, struct ib_event *event) { struct mcast_device *dev; int index; dev = container_of(handler, struct mcast_device, event_handler); - if (rdma_port_get_link_layer(dev->device, event->element.port_num) != - IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_mcast(dev->device, event->element.port_num)) return; index = event->element.port_num - dev->start_port; switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: + case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; case IB_EVENT_PKEY_CHANGE: mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); break; default: break; } } static void mcast_add_one(struct ib_device *device) { struct mcast_device *dev; 
struct mcast_port *port; int i; int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, GFP_KERNEL); if (!dev) return; - if (device->node_type == RDMA_NODE_IB_SWITCH) - dev->start_port = dev->end_port = 0; - else { - dev->start_port = 1; - dev->end_port = device->phys_port_cnt; - } + dev->start_port = rdma_start_port(device); + dev->end_port = rdma_end_port(device); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) != - IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_mcast(device, dev->start_port + i)) continue; port = &dev->port[i]; port->dev = dev; port->port_num = dev->start_port + i; spin_lock_init(&port->lock); port->table = RB_ROOT; init_completion(&port->comp); atomic_set(&port->refcount, 1); ++count; } if (!count) { kfree(dev); return; } dev->device = device; ib_set_client_data(device, &mcast_client, dev); INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); ib_register_event_handler(&dev->event_handler); } -static void mcast_remove_one(struct ib_device *device) +static void mcast_remove_one(struct ib_device *device, void *client_data) { - struct mcast_device *dev; + struct mcast_device *dev = client_data; struct mcast_port *port; int i; - dev = ib_get_client_data(device, &mcast_client); if (!dev) return; ib_unregister_event_handler(&dev->event_handler); flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) == - IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_mcast(device, dev->start_port + i)) { port = &dev->port[i]; deref_port(port); wait_for_completion(&port->comp); } } kfree(dev); } int mcast_init(void) { int ret; - mcast_wq = create_singlethread_workqueue("ib_mcast"); + mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM); if (!mcast_wq) return -ENOMEM; 
ib_sa_register_client(&sa_client); ret = ib_register_client(&mcast_client); if (ret) goto err; return 0; err: ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); return ret; } void mcast_cleanup(void) { ib_unregister_client(&mcast_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); } Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/opa_smi.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/opa_smi.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/opa_smi.h (revision 319974) @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __OPA_SMI_H_ +#define __OPA_SMI_H_ + +#include +#include + +#include "smi.h" + +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt); +int opa_smi_get_fwd_port(struct opa_smp *smp); +extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); +extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return (device->process_mad && + !opa_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? + IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return (device->process_mad && + opa_get_smp_direction(smp) && + !smp->hop_ptr) ? 
IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +#endif /* __OPA_SMI_H_ */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/opa_smi.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/packer.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/packer.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/packer.c (revision 319974) @@ -1,203 +1,200 @@ /* * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include #include #include static u64 value_read(int offset, int size, void *structure) { switch (size) { - case 1: return *(u8 *) (structure + offset); - case 2: return be16_to_cpup((__be16 *) (structure + offset)); - case 4: return be32_to_cpup((__be32 *) (structure + offset)); - case 8: return be64_to_cpup((__be64 *) (structure + offset)); + case 1: return *(u8 *) ((char *)structure + offset); + case 2: return be16_to_cpup((__be16 *) ((char *)structure + offset)); + case 4: return be32_to_cpup((__be32 *) ((char *)structure + offset)); + case 8: return be64_to_cpup((__be64 *) ((char *)structure + offset)); default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); return 0; } } /** * ib_pack - Pack a structure into a buffer * @desc:Array of structure field descriptions * @desc_len:Number of entries in @desc * @structure:Structure to pack from * @buf:Buffer to pack into * * ib_pack() packs a list of structure fields into a buffer, * controlled by the array of fields in @desc. 
*/ void ib_pack(const struct ib_field *desc, int desc_len, void *structure, void *buf) { int i; for (i = 0; i < desc_len; ++i) { if (desc[i].size_bits <= 32) { int shift; u32 val; __be32 mask; __be32 *addr; shift = 32 - desc[i].offset_bits - desc[i].size_bits; if (desc[i].struct_size_bytes) val = value_read(desc[i].struct_offset_bytes, desc[i].struct_size_bytes, structure) << shift; else val = 0; mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift); addr = (__be32 *) buf + desc[i].offset_words; *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask); } else if (desc[i].size_bits <= 64) { int shift; u64 val; __be64 mask; __be64 *addr; shift = 64 - desc[i].offset_bits - desc[i].size_bits; if (desc[i].struct_size_bytes) val = value_read(desc[i].struct_offset_bytes, desc[i].struct_size_bytes, structure) << shift; else val = 0; mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift); addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words); *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask); } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } if (desc[i].struct_size_bytes) - memcpy(buf + desc[i].offset_words * 4 + + memcpy((char *)buf + desc[i].offset_words * 4 + desc[i].offset_bits / 8, - structure + desc[i].struct_offset_bytes, + (char *)structure + desc[i].struct_offset_bytes, desc[i].size_bits / 8); else - memset(buf + desc[i].offset_words * 4 + + memset((char *)buf + desc[i].offset_words * 4 + desc[i].offset_bits / 8, 0, desc[i].size_bits / 8); } } } EXPORT_SYMBOL(ib_pack); static void value_write(int offset, int size, u64 val, void *structure) { switch (size * 8) { - case 8: *( u8 *) (structure + offset) = val; break; - case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break; - case 
32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break; - case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break; + case 8: *( u8 *) ((char *)structure + offset) = val; break; + case 16: *(__be16 *) ((char *)structure + offset) = cpu_to_be16(val); break; + case 32: *(__be32 *) ((char *)structure + offset) = cpu_to_be32(val); break; + case 64: *(__be64 *) ((char *)structure + offset) = cpu_to_be64(val); break; default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); } } /** * ib_unpack - Unpack a buffer into a structure * @desc:Array of structure field descriptions * @desc_len:Number of entries in @desc * @buf:Buffer to unpack from * @structure:Structure to unpack into * * ib_pack() unpacks a list of structure fields from a buffer, * controlled by the array of fields in @desc. */ void ib_unpack(const struct ib_field *desc, int desc_len, void *buf, void *structure) { int i; for (i = 0; i < desc_len; ++i) { if (!desc[i].struct_size_bytes) continue; if (desc[i].size_bits <= 32) { int shift; u32 val; u32 mask; __be32 *addr; shift = 32 - desc[i].offset_bits - desc[i].size_bits; mask = ((1ull << desc[i].size_bits) - 1) << shift; addr = (__be32 *) buf + desc[i].offset_words; val = (be32_to_cpup(addr) & mask) >> shift; value_write(desc[i].struct_offset_bytes, desc[i].struct_size_bytes, val, structure); } else if (desc[i].size_bits <= 64) { int shift; u64 val; u64 mask; __be64 *addr; shift = 64 - desc[i].offset_bits - desc[i].size_bits; mask = (~0ull >> (64 - desc[i].size_bits)) << shift; addr = (__be64 *) buf + desc[i].offset_words; val = (be64_to_cpup(addr) & mask) >> shift; value_write(desc[i].struct_offset_bytes, desc[i].struct_size_bytes, val, structure); } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field 
%s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } - memcpy(structure + desc[i].struct_offset_bytes, - buf + desc[i].offset_words * 4 + + memcpy((char *)structure + desc[i].struct_offset_bytes, + (char *)buf + desc[i].offset_words * 4 + desc[i].offset_bits / 8, desc[i].size_bits / 8); } } } EXPORT_SYMBOL(ib_unpack); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/sa_query.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/sa_query.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/sa_query.c (revision 319974) @@ -1,1278 +1,1580 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include - +#include #include #include +#include +#include +#include #include "sa.h" +#include "core_priv.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand subnet administration query support"); -MODULE_LICENSE("Dual BSD/GPL"); +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 +#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 struct ib_sa_sm_ah { struct ib_ah *ah; struct kref ref; u16 pkey_index; u8 src_path_mask; }; +struct ib_sa_classport_cache { + bool valid; + struct ib_class_port_info data; +}; + struct ib_sa_port { struct ib_mad_agent *agent; struct ib_sa_sm_ah *sm_ah; struct work_struct update_task; + struct ib_sa_classport_cache classport_info; + spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; u8 port_num; }; struct ib_sa_device { int start_port, end_port; struct ib_event_handler event_handler; struct ib_sa_port port[0]; }; struct ib_sa_query { void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; struct ib_mad_send_buf *mad_buf; struct ib_sa_sm_ah *sm_ah; int id; + u32 flags; + struct list_head list; /* Local svc request list */ + u32 seq; /* Local svc request sequence number */ + unsigned long timeout; /* Local svc timeout */ + u8 path_use; /* How will the pathrecord be used */ }; +#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 +#define IB_SA_CANCEL 0x00000002 + struct ib_sa_service_query { void (*callback)(int, struct ib_sa_service_rec *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_path_query { void 
(*callback)(int, struct ib_sa_path_rec *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_guidinfo_query { void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); void *context; struct ib_sa_query sa_query; }; +struct ib_sa_classport_info_query { + void (*callback)(int, struct ib_class_port_info *, void *); + void *context; + struct ib_sa_query sa_query; +}; + struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; struct ib_sa_query sa_query; }; static void ib_sa_add_one(struct ib_device *device); -static void ib_sa_remove_one(struct ib_device *device); +static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { .name = "sa", .add = ib_sa_add_one, .remove = ib_sa_remove_one }; static DEFINE_SPINLOCK(idr_lock); static DEFINE_IDR(query_idr); static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \ .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { { PATH_REC_FIELD(service_id), .offset_words = 0, .offset_bits = 0, .size_bits = 64 }, { PATH_REC_FIELD(dgid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { PATH_REC_FIELD(sgid), .offset_words = 6, .offset_bits = 0, .size_bits = 128 }, { PATH_REC_FIELD(dlid), .offset_words = 10, .offset_bits = 0, .size_bits = 16 }, { PATH_REC_FIELD(slid), .offset_words = 10, .offset_bits = 16, .size_bits = 16 }, { PATH_REC_FIELD(raw_traffic), .offset_words = 11, .offset_bits = 0, .size_bits = 1 }, { RESERVED, .offset_words = 11, .offset_bits = 1, .size_bits = 3 }, { PATH_REC_FIELD(flow_label), .offset_words = 11, .offset_bits = 4, .size_bits = 20 }, { PATH_REC_FIELD(hop_limit), .offset_words = 11, .offset_bits = 24, .size_bits = 8 }, { PATH_REC_FIELD(traffic_class), .offset_words = 12, .offset_bits = 0, 
.size_bits = 8 }, { PATH_REC_FIELD(reversible), .offset_words = 12, .offset_bits = 8, .size_bits = 1 }, { PATH_REC_FIELD(numb_path), .offset_words = 12, .offset_bits = 9, .size_bits = 7 }, { PATH_REC_FIELD(pkey), .offset_words = 12, .offset_bits = 16, .size_bits = 16 }, { PATH_REC_FIELD(qos_class), .offset_words = 13, .offset_bits = 0, .size_bits = 12 }, { PATH_REC_FIELD(sl), .offset_words = 13, .offset_bits = 12, .size_bits = 4 }, { PATH_REC_FIELD(mtu_selector), .offset_words = 13, .offset_bits = 16, .size_bits = 2 }, { PATH_REC_FIELD(mtu), .offset_words = 13, .offset_bits = 18, .size_bits = 6 }, { PATH_REC_FIELD(rate_selector), .offset_words = 13, .offset_bits = 24, .size_bits = 2 }, { PATH_REC_FIELD(rate), .offset_words = 13, .offset_bits = 26, .size_bits = 6 }, { PATH_REC_FIELD(packet_life_time_selector), .offset_words = 14, .offset_bits = 0, .size_bits = 2 }, { PATH_REC_FIELD(packet_life_time), .offset_words = 14, .offset_bits = 2, .size_bits = 6 }, { PATH_REC_FIELD(preference), .offset_words = 14, .offset_bits = 8, .size_bits = 8 }, { RESERVED, .offset_words = 14, .offset_bits = 16, .size_bits = 48 }, }; #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ .field_name = "sa_mcmember_rec:" #field static const struct ib_field mcmember_rec_table[] = { { MCMEMBER_REC_FIELD(mgid), .offset_words = 0, .offset_bits = 0, .size_bits = 128 }, { MCMEMBER_REC_FIELD(port_gid), .offset_words = 4, .offset_bits = 0, .size_bits = 128 }, { MCMEMBER_REC_FIELD(qkey), .offset_words = 8, .offset_bits = 0, .size_bits = 32 }, { MCMEMBER_REC_FIELD(mlid), .offset_words = 9, .offset_bits = 0, .size_bits = 16 }, { MCMEMBER_REC_FIELD(mtu_selector), .offset_words = 9, .offset_bits = 16, .size_bits = 2 }, { MCMEMBER_REC_FIELD(mtu), .offset_words = 9, .offset_bits = 18, .size_bits = 6 }, { MCMEMBER_REC_FIELD(traffic_class), .offset_words = 9, .offset_bits = 24, 
.size_bits = 8 }, { MCMEMBER_REC_FIELD(pkey), .offset_words = 10, .offset_bits = 0, .size_bits = 16 }, { MCMEMBER_REC_FIELD(rate_selector), .offset_words = 10, .offset_bits = 16, .size_bits = 2 }, { MCMEMBER_REC_FIELD(rate), .offset_words = 10, .offset_bits = 18, .size_bits = 6 }, { MCMEMBER_REC_FIELD(packet_life_time_selector), .offset_words = 10, .offset_bits = 24, .size_bits = 2 }, { MCMEMBER_REC_FIELD(packet_life_time), .offset_words = 10, .offset_bits = 26, .size_bits = 6 }, { MCMEMBER_REC_FIELD(sl), .offset_words = 11, .offset_bits = 0, .size_bits = 4 }, { MCMEMBER_REC_FIELD(flow_label), .offset_words = 11, .offset_bits = 4, .size_bits = 20 }, { MCMEMBER_REC_FIELD(hop_limit), .offset_words = 11, .offset_bits = 24, .size_bits = 8 }, { MCMEMBER_REC_FIELD(scope), .offset_words = 12, .offset_bits = 0, .size_bits = 4 }, { MCMEMBER_REC_FIELD(join_state), .offset_words = 12, .offset_bits = 4, .size_bits = 4 }, { MCMEMBER_REC_FIELD(proxy_join), .offset_words = 12, .offset_bits = 8, .size_bits = 1 }, { RESERVED, .offset_words = 12, .offset_bits = 9, .size_bits = 23 }, }; #define SERVICE_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \ .field_name = "sa_service_rec:" #field static const struct ib_field service_rec_table[] = { { SERVICE_REC_FIELD(id), .offset_words = 0, .offset_bits = 0, .size_bits = 64 }, { SERVICE_REC_FIELD(gid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(pkey), .offset_words = 6, .offset_bits = 0, .size_bits = 16 }, { SERVICE_REC_FIELD(lease), .offset_words = 7, .offset_bits = 0, .size_bits = 32 }, { SERVICE_REC_FIELD(key), .offset_words = 8, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(name), .offset_words = 12, .offset_bits = 0, .size_bits = 64*8 }, { SERVICE_REC_FIELD(data8), .offset_words = 28, .offset_bits = 0, .size_bits = 16*8 }, { SERVICE_REC_FIELD(data16), .offset_words = 32, .offset_bits 
= 0, .size_bits = 8*16 }, { SERVICE_REC_FIELD(data32), .offset_words = 36, .offset_bits = 0, .size_bits = 4*32 }, { SERVICE_REC_FIELD(data64), .offset_words = 40, .offset_bits = 0, .size_bits = 2*64 }, }; +#define CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ + .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .field_name = "ib_class_port_info:" #field + +static const struct ib_field classport_info_rec_table[] = { + { CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(capability_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 7, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(redirect_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(trap_tcslfl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 16 }, + + { 
CLASSPORTINFO_REC_FIELD(trap_hlqp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, +}; + #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ .field_name = "sa_guidinfo_rec:" #field static const struct ib_field guidinfo_rec_table[] = { { GUIDINFO_REC_FIELD(lid), .offset_words = 0, .offset_bits = 0, .size_bits = 16 }, { GUIDINFO_REC_FIELD(block_num), .offset_words = 0, .offset_bits = 16, .size_bits = 8 }, { GUIDINFO_REC_FIELD(res1), .offset_words = 0, .offset_bits = 24, .size_bits = 8 }, { GUIDINFO_REC_FIELD(res2), .offset_words = 1, .offset_bits = 0, .size_bits = 32 }, { GUIDINFO_REC_FIELD(guid_info_list), .offset_words = 2, .offset_bits = 0, .size_bits = 512 }, }; +static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) +{ + query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; +} + static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); ib_destroy_ah(sm_ah->ah); kfree(sm_ah); } static void update_sm_ah(struct work_struct *work) { struct ib_sa_port *port = container_of(work, struct ib_sa_port, update_task); struct ib_sa_sm_ah *new_ah; struct ib_port_attr port_attr; struct ib_ah_attr ah_attr; if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { - printk(KERN_WARNING "Couldn't query port\n"); + pr_warn("Couldn't query port\n"); return; } new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL); if (!new_ah) { - printk(KERN_WARNING "Couldn't allocate new SM AH\n"); return; } kref_init(&new_ah->ref); new_ah->src_path_mask = (1 << port_attr.lmc) - 1; new_ah->pkey_index = 0; if (ib_find_pkey(port->agent->device, port->port_num, IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) - printk(KERN_ERR "Couldn't find index for default PKey\n"); + pr_err("Couldn't find index 
for default PKey\n"); memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = port_attr.sm_lid; ah_attr.sl = port_attr.sm_sl; ah_attr.port_num = port->port_num; + if (port_attr.grh_required) { + ah_attr.ah_flags = IB_AH_GRH; + ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(port_attr.subnet_prefix); + ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID); + } new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); if (IS_ERR(new_ah->ah)) { - printk(KERN_WARNING "Couldn't create new SM AH\n"); + pr_warn("Couldn't create new SM AH\n"); kfree(new_ah); return; } spin_lock_irq(&port->ah_lock); if (port->sm_ah) kref_put(&port->sm_ah->ref, free_sm_ah); port->sm_ah = new_ah; spin_unlock_irq(&port->ah_lock); } static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event) { if (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_SM_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER) { unsigned long flags; struct ib_sa_device *sa_dev = container_of(handler, typeof(*sa_dev), event_handler); struct ib_sa_port *port = &sa_dev->port[event->element.port_num - sa_dev->start_port]; - if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_sa(handler->device, port->port_num)) return; spin_lock_irqsave(&port->ah_lock, flags); if (port->sm_ah) kref_put(&port->sm_ah->ref, free_sm_ah); port->sm_ah = NULL; spin_unlock_irqrestore(&port->ah_lock, flags); + if (event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_LID_CHANGE) { + spin_lock_irqsave(&port->classport_lock, flags); + port->classport_info.valid = false; + spin_unlock_irqrestore(&port->classport_lock, flags); + } queue_work(ib_wq, &sa_dev->port[event->element.port_num - sa_dev->start_port].update_task); } } void ib_sa_register_client(struct 
ib_sa_client *client) { atomic_set(&client->users, 1); init_completion(&client->comp); } EXPORT_SYMBOL(ib_sa_register_client); void ib_sa_unregister_client(struct ib_sa_client *client) { ib_sa_client_put(client); wait_for_completion(&client->comp); } EXPORT_SYMBOL(ib_sa_unregister_client); /** * ib_sa_cancel_query - try to cancel an SA query * @id:ID of query to cancel * @query:query pointer to cancel * * Try to cancel an SA query. If the id and query don't match up or * the query has already completed, nothing is done. Otherwise the * query is canceled and will complete with a status of -EINTR. */ void ib_sa_cancel_query(int id, struct ib_sa_query *query) { unsigned long flags; struct ib_mad_agent *agent; struct ib_mad_send_buf *mad_buf; spin_lock_irqsave(&idr_lock, flags); if (idr_find(&query_idr, id) != query) { spin_unlock_irqrestore(&idr_lock, flags); return; } agent = query->port->agent; mad_buf = query->mad_buf; spin_unlock_irqrestore(&idr_lock, flags); - - ib_cancel_mad(agent, mad_buf); } EXPORT_SYMBOL(ib_sa_cancel_query); static u8 get_src_path_mask(struct ib_device *device, u8 port_num) { struct ib_sa_device *sa_dev; struct ib_sa_port *port; unsigned long flags; u8 src_path_mask; sa_dev = ib_get_client_data(device, &sa_client); if (!sa_dev) return 0x7f; port = &sa_dev->port[port_num - sa_dev->start_port]; spin_lock_irqsave(&port->ah_lock, flags); src_path_mask = port->sm_ah ? 
port->sm_ah->src_path_mask : 0x7f; spin_unlock_irqrestore(&port->ah_lock, flags); return src_path_mask; } int ib_init_ah_from_path(struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; - int force_grh; + int use_roce; + struct net_device *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); ah_attr->sl = rec->sl; ah_attr->src_path_bits = be16_to_cpu(rec->slid) & get_src_path_mask(device, port_num); ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET; + use_roce = rdma_cap_eth_ah(device, port_num); - if (rec->hop_limit > 1 || force_grh) { + if (use_roce) { + struct net_device *idev; + struct net_device *resolved_dev; + struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, + .net = rec->net ? rec->net : + &init_net}; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (resolved_dev->if_flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + 
rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) { + if (ndev) + dev_put(ndev); + return ret; + } + } + + if (rec->hop_limit > 0 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, &port_num, - &gid_index); - if (ret) + ret = ib_find_cached_gid_by_port(device, &rec->sgid, + rec->gid_type, port_num, ndev, + &gid_index); + if (ret) { + if (ndev) + dev_put(ndev); return ret; + } ah_attr->grh.sgid_index = gid_index; ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; + if (ndev) + dev_put(ndev); } - if (force_grh) { - memcpy(ah_attr->dmac, rec->dmac, 6); - ah_attr->vlan_id = rec->vlan_id; - } else { - memset(ah_attr->dmac, 0, 6); - ah_attr->vlan_id = 0xffff; - } + if (use_roce) + memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) { unsigned long flags; spin_lock_irqsave(&query->port->ah_lock, flags); if (!query->port->sm_ah) { spin_unlock_irqrestore(&query->port->ah_lock, flags); return -EAGAIN; } kref_get(&query->port->sm_ah->ref); query->sm_ah = query->port->sm_ah; spin_unlock_irqrestore(&query->port->ah_lock, flags); query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, - gfp_mask); + gfp_mask, + IB_MGMT_BASE_VERSION); if (IS_ERR(query->mad_buf)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -ENOMEM; } query->mad_buf->ah = query->sm_ah->ah; return 0; } static void free_mad(struct ib_sa_query *query) { ib_free_send_mad(query->mad_buf); kref_put(&query->sm_ah->ref, free_sm_ah); } static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) { unsigned long flags; memset(mad, 0, sizeof *mad); mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; 
mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; spin_lock_irqsave(&tid_lock, flags); mad->mad_hdr.tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++); spin_unlock_irqrestore(&tid_lock, flags); } static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; -retry: - if (!idr_pre_get(&query_idr, gfp_mask)) - return -ENOMEM; + if (preload) + idr_preload(gfp_mask); spin_lock_irqsave(&idr_lock, flags); - ret = idr_get_new(&query_idr, query, &id); + + id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); + spin_unlock_irqrestore(&idr_lock, flags); - if (ret == -EAGAIN) - goto retry; - if (ret) - return ret; + if (preload) + idr_preload_end(); + if (id < 0) + return id; query->mad_buf->timeout_ms = timeout_ms; query->mad_buf->context[0] = query; query->id = id; + if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) { + ib_sa_disable_local_svc(query); + } + ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { spin_lock_irqsave(&idr_lock, flags); idr_remove(&query_idr, id); spin_unlock_irqrestore(&idr_lock, flags); } /* * It's not safe to dereference query any more, because the * send may already have completed and freed the query in * another context. */ return ret ? 
ret : id; } void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec) { ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); } EXPORT_SYMBOL(ib_sa_unpack_path); +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute) +{ + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); +} +EXPORT_SYMBOL(ib_sa_pack_path); + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_path_query *query = container_of(sa_query, struct ib_sa_path_query, sa_query); if (mad) { struct ib_sa_path_rec rec; ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); - rec.vlan_id = 0xffff; - memset(rec.dmac, 0, ETH_ALEN); - memset(rec.smac, 0, ETH_ALEN); - + rec.net = NULL; + rec.ifindex = 0; + rec.gid_type = IB_GID_TYPE_IB; + eth_zero_addr(rec.dmac); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); } - /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client * @device:device to send query on * @port_num: port number to send query on * @rec:Path Record to send in query * @comp_mask:component mask to send in query * @timeout_ms:time to wait for response * @gfp_mask:GFP mask to use for internal allocations * @callback:function called when query completes, times out or is * canceled * @context:opaque user context passed to callback * @sa_query:query context, used to cancel query * * Send a Path Record Get query to the SA to look up a path. The * callback function will be called when the query completes (or * fails); status is 0 for a successful response, -EINTR if the query * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error * occurred sending the query. The resp parameter of the callback is * only valid if status is 0. 
* * If the return value of ib_sa_path_rec_get() is negative, it is an * error code. Otherwise it is a query ID that can be used to cancel * the query. */ int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_path_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) { struct ib_sa_path_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? 
ib_sa_path_rec_callback : NULL; query->sa_query.release = ib_sa_path_rec_release; mad->mad_hdr.method = IB_MGMT_METHOD_GET; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data); *sa_query = &query->sa_query; + query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; + query->sa_query.mad_buf->context[1] = rec; + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_path_rec_get); static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_service_query *query = container_of(sa_query, struct ib_sa_service_query, sa_query); if (mad) { struct ib_sa_service_rec rec; ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_service_query, sa_query)); } /** * ib_sa_service_rec_query - Start Service Record operation * @client:SA client * @device:device to send request on * @port_num: port number to send request on * @method:SA method - should be get, set, or delete * @rec:Service Record to send in request * @comp_mask:component mask to send in request * @timeout_ms:time to wait for response * @gfp_mask:GFP mask to use for internal allocations * @callback:function called when request completes, times out or is * canceled * @context:opaque user context passed to callback * @sa_query:request context, used to cancel request * * Send a Service Record set/get/delete to the SA to register, * unregister or query a service record. 
* The callback function will be called when the request completes (or * fails); status is 0 for a successful response, -EINTR if the query * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error * occurred sending the query. The resp parameter of the callback is * only valid if status is 0. * * If the return value of ib_sa_service_rec_query() is negative, it is an * error code. Otherwise it is a request ID that can be used to cancel * the query. */ int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_service_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; if (method != IB_MGMT_METHOD_GET && method != IB_MGMT_METHOD_SET && method != IB_SA_METHOD_DELETE) return -EINVAL; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? 
ib_sa_service_rec_callback : NULL; query->sa_query.release = ib_sa_service_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_service_rec_query); static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_mcmember_query *query = container_of(sa_query, struct ib_sa_mcmember_query, sa_query); if (mad) { struct ib_sa_mcmember_rec rec; ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query)); } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_mcmember_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; 
ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL; query->sa_query.release = ib_sa_mcmember_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) + int status, + struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); if (mad) { struct ib_sa_guidinfo_rec rec; ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); } int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_guidinfo_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; + struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; if (method != 
IB_MGMT_METHOD_GET && method != IB_MGMT_METHOD_SET && method != IB_SA_METHOD_DELETE) { return -EINVAL; } port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; - query->sa_query.port = port; + query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; - query->callback = callback; - query->context = context; + query->callback = callback; + query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL; query->sa_query.release = ib_sa_guidinfo_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); +/* Support get SA ClassPortInfo */ +static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + unsigned long flags; + struct ib_sa_classport_info_query *query = + container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + + if (mad) { + struct ib_class_port_info rec; + + ib_unpack(classport_info_rec_table, + ARRAY_SIZE(classport_info_rec_table), + mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, flags); + if (!status && !sa_query->port->classport_info.valid) { + memcpy(&sa_query->port->classport_info.data, &rec, + sizeof(sa_query->port->classport_info.data)); + + sa_query->port->classport_info.valid = true; + } + 
spin_unlock_irqrestore(&sa_query->port->classport_lock, flags); + + query->callback(status, &rec, query->context); + } else { + query->callback(status, NULL, query->context); + } +} + +static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_classport_info_query, + sa_query)); +} + +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_classport_info_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + struct ib_class_port_info cached_class_port_info; + int ret; + unsigned long flags; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + /* Use cached ClassPortInfo attribute if valid instead of sending mad */ + spin_lock_irqsave(&port->classport_lock, flags); + if (port->classport_info.valid && callback) { + memcpy(&cached_class_port_info, &port->classport_info.data, + sizeof(cached_class_port_info)); + spin_unlock_irqrestore(&port->classport_lock, flags); + callback(0, &cached_class_port_info, context); + return 0; + } + spin_unlock_irqrestore(&port->classport_lock, flags); + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? 
ib_sa_classport_info_rec_callback : NULL; + + query->sa_query.release = ib_sa_portclass_info_rec_release; + /* support GET only */ + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); + mad->sa_hdr.comp_mask = 0; + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_classport_info_rec_query); + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; unsigned long flags; if (query->callback) switch (mad_send_wc->status) { case IB_WC_SUCCESS: /* No callback -- already got recv */ break; case IB_WC_RESP_TIMEOUT_ERR: query->callback(query, -ETIMEDOUT, NULL); break; case IB_WC_WR_FLUSH_ERR: query->callback(query, -EINTR, NULL); break; default: query->callback(query, -EIO, NULL); break; } spin_lock_irqsave(&idr_lock, flags); idr_remove(&query_idr, query->id); spin_unlock_irqrestore(&idr_lock, flags); free_mad(query); ib_sa_client_put(query->client); query->release(query); } static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; - struct ib_mad_send_buf *mad_buf; - mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id; - query = mad_buf->context[0]; + if (!send_buf) + return; + query = send_buf->context[0]; if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, mad_recv_wc->recv_buf.mad->mad_hdr.status ? 
-EINVAL : 0, (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); else query->callback(query, -EIO, NULL); } ib_free_recv_mad(mad_recv_wc); } static void ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; int s, e, i; + int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + s = rdma_start_port(device); + e = rdma_end_port(device); - if (device->node_type == RDMA_NODE_IB_SWITCH) - s = e = 0; - else { - s = 1; - e = device->phys_port_cnt; - } - sa_dev = kzalloc(sizeof *sa_dev + (e - s + 1) * sizeof (struct ib_sa_port), GFP_KERNEL); if (!sa_dev) return; sa_dev->start_port = s; sa_dev->end_port = e; for (i = 0; i <= e - s; ++i) { spin_lock_init(&sa_dev->port[i].ah_lock); - if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_sa(device, i + 1)) continue; sa_dev->port[i].sm_ah = NULL; sa_dev->port[i].port_num = i + s; + spin_lock_init(&sa_dev->port[i].classport_lock); + sa_dev->port[i].classport_info.valid = false; + sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, - recv_handler, sa_dev); + recv_handler, sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) goto err; INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); + + count++; } + if (!count) + goto free; + ib_set_client_data(device, &sa_client, sa_dev); /* * We register our event handler after everything is set up, * and then update our cached info after the event handler is * registered to avoid any problems if a port changes state * during our initialization. 
*/ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); if (ib_register_event_handler(&sa_dev->event_handler)) - goto reg_err; + goto err; - for (i = 0; i <= e - s; ++i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + for (i = 0; i <= e - s; ++i) { + if (rdma_cap_ib_sa(device, i + 1)) update_sm_ah(&sa_dev->port[i].update_task); + } return; -reg_err: - ib_set_client_data(device, &sa_client, NULL); - i = e - s; err: - for (; i >= 0; --i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND && - !IS_ERR(sa_dev->port[i].agent)) - ib_unregister_mad_agent(sa_dev->port[i].agent); - + while (--i >= 0) { + if (rdma_cap_ib_sa(device, i + 1)) + ib_unregister_mad_agent(sa_dev->port[i].agent); + } +free: kfree(sa_dev); - return; } -static void ib_sa_remove_one(struct ib_device *device) +static void ib_sa_remove_one(struct ib_device *device, void *client_data) { - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_device *sa_dev = client_data; int i; if (!sa_dev) return; ib_unregister_event_handler(&sa_dev->event_handler); flush_workqueue(ib_wq); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_sa(device, i + 1)) { ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); } } kfree(sa_dev); } -static int __init ib_sa_init(void) +int ib_sa_init(void) { int ret; get_random_bytes(&tid, sizeof tid); ret = ib_register_client(&sa_client); if (ret) { - printk(KERN_ERR "Couldn't register ib_sa client\n"); + pr_err("Couldn't register ib_sa client\n"); goto err1; } ret = mcast_init(); if (ret) { - printk(KERN_ERR "Couldn't initialize multicast handling\n"); + pr_err("Couldn't initialize multicast handling\n"); goto err2; } return 0; + err2: ib_unregister_client(&sa_client); err1: return ret; } -static void __exit 
ib_sa_cleanup(void) +void ib_sa_cleanup(void) { mcast_cleanup(); ib_unregister_client(&sa_client); idr_destroy(&query_idr); } - -module_init_order(ib_sa_init, SI_ORDER_SECOND); -module_exit(ib_sa_cleanup); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/smi.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/smi.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/smi.h (revision 319974) @@ -1,90 +1,90 @@ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #ifndef __SMI_H_ #define __SMI_H_ #include enum smi_action { IB_SMI_DISCARD, IB_SMI_HANDLE }; enum smi_forward_action { IB_SMI_LOCAL, /* SMP should be completed up the stack */ IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */ IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */ }; -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, int port_num, int phys_port_cnt); int smi_get_fwd_port(struct ib_smp *smp); extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num); + bool is_switch, int port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM * via process_mad */ static inline enum smi_action smi_check_local_smp(struct ib_smp *smp, struct ib_device *device) { /* C14-9:3 -- We're at the end of the DR segment of path */ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ return ((device->process_mad && !ib_get_smp_direction(smp) && (smp->hop_ptr == smp->hop_cnt + 1)) ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM * via process_mad */ static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp, struct ib_device *device) { /* C14-13:3 -- We're at the end of the DR segment of path */ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ return ((device->process_mad && ib_get_smp_direction(smp) && !smp->hop_ptr) ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); } #endif /* __SMI_H_ */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ucm.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ucm.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ucm.c (revision 319974) @@ -1,1388 +1,1371 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include -#include #include +#include #include #include #include MODULE_AUTHOR("Libor Michalek"); MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); MODULE_LICENSE("Dual BSD/GPL"); struct ib_ucm_device { int devnum; struct cdev cdev; struct device dev; struct ib_device *ib_dev; }; struct ib_ucm_file { struct mutex file_mutex; struct file *filp; struct ib_ucm_device *device; struct list_head ctxs; struct list_head events; wait_queue_head_t poll_wait; }; struct ib_ucm_context { int id; struct completion comp; atomic_t ref; int events_reported; struct ib_ucm_file *file; struct ib_cm_id *cm_id; __u64 uid; struct list_head events; /* list of pending events. */ struct list_head file_list; /* member in file ctx list */ }; struct ib_ucm_event { struct ib_ucm_context *ctx; struct list_head file_list; /* member in file event list */ struct list_head ctx_list; /* member in ctx event list */ struct ib_cm_id *cm_id; struct ib_ucm_event_resp resp; void *data; void *info; int data_len; int info_len; }; enum { IB_UCM_MAJOR = 231, IB_UCM_BASE_MINOR = 224, IB_UCM_MAX_DEVICES = 32 }; #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) static void ib_ucm_add_one(struct ib_device *device); -static void ib_ucm_remove_one(struct ib_device *device); +static void ib_ucm_remove_one(struct ib_device *device, void *client_data); static struct ib_client ucm_client = { .name = "ucm", .add = ib_ucm_add_one, .remove = ib_ucm_remove_one }; static DEFINE_MUTEX(ctx_id_mutex); static DEFINE_IDR(ctx_id_table); static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) { struct ib_ucm_context *ctx; mutex_lock(&ctx_id_mutex); ctx = idr_find(&ctx_id_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); else atomic_inc(&ctx->ref); mutex_unlock(&ctx_id_mutex); 
return ctx; } static void ib_ucm_ctx_put(struct ib_ucm_context *ctx) { if (atomic_dec_and_test(&ctx->ref)) complete(&ctx->comp); } static inline int ib_ucm_new_cm_id(int event) { return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED; } static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) { struct ib_ucm_event *uevent; mutex_lock(&ctx->file->file_mutex); list_del(&ctx->file_list); while (!list_empty(&ctx->events)) { uevent = list_entry(ctx->events.next, struct ib_ucm_event, ctx_list); list_del(&uevent->file_list); list_del(&uevent->ctx_list); mutex_unlock(&ctx->file->file_mutex); /* clear incoming connections. */ if (ib_ucm_new_cm_id(uevent->resp.event)) ib_destroy_cm_id(uevent->cm_id); kfree(uevent); mutex_lock(&ctx->file->file_mutex); } mutex_unlock(&ctx->file->file_mutex); } static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) { struct ib_ucm_context *ctx; - int result; ctx = kzalloc(sizeof *ctx, GFP_KERNEL); if (!ctx) return NULL; atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); ctx->file = file; INIT_LIST_HEAD(&ctx->events); - do { - result = idr_pre_get(&ctx_id_table, GFP_KERNEL); - if (!result) - goto error; - - mutex_lock(&ctx_id_mutex); - result = idr_get_new(&ctx_id_table, ctx, &ctx->id); - mutex_unlock(&ctx_id_mutex); - } while (result == -EAGAIN); - - if (result) + mutex_lock(&ctx_id_mutex); + ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&ctx_id_mutex); + if (ctx->id < 0) goto error; list_add_tail(&ctx->file_list, &file->ctxs); return ctx; error: kfree(ctx); return NULL; } static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, struct ib_cm_req_event_param *kreq) { ureq->remote_ca_guid = kreq->remote_ca_guid; ureq->remote_qkey = kreq->remote_qkey; ureq->remote_qpn = kreq->remote_qpn; ureq->qp_type = kreq->qp_type; ureq->starting_psn = kreq->starting_psn; ureq->responder_resources = kreq->responder_resources; ureq->initiator_depth = kreq->initiator_depth; 
ureq->local_cm_response_timeout = kreq->local_cm_response_timeout; ureq->flow_control = kreq->flow_control; ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout; ureq->retry_count = kreq->retry_count; ureq->rnr_retry_count = kreq->rnr_retry_count; ureq->srq = kreq->srq; ureq->port = kreq->port; ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path); if (kreq->alternate_path) ib_copy_path_rec_to_user(&ureq->alternate_path, kreq->alternate_path); } static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, struct ib_cm_rep_event_param *krep) { urep->remote_ca_guid = krep->remote_ca_guid; urep->remote_qkey = krep->remote_qkey; urep->remote_qpn = krep->remote_qpn; urep->starting_psn = krep->starting_psn; urep->responder_resources = krep->responder_resources; urep->initiator_depth = krep->initiator_depth; urep->target_ack_delay = krep->target_ack_delay; urep->failover_accepted = krep->failover_accepted; urep->flow_control = krep->flow_control; urep->rnr_retry_count = krep->rnr_retry_count; urep->srq = krep->srq; } static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep, struct ib_cm_sidr_rep_event_param *krep) { urep->status = krep->status; urep->qkey = krep->qkey; urep->qpn = krep->qpn; }; static int ib_ucm_event_process(struct ib_cm_event *evt, struct ib_ucm_event *uvt) { void *info = NULL; switch (evt->event) { case IB_CM_REQ_RECEIVED: ib_ucm_event_req_get(&uvt->resp.u.req_resp, &evt->param.req_rcvd); uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE; uvt->resp.present = IB_UCM_PRES_PRIMARY; uvt->resp.present |= (evt->param.req_rcvd.alternate_path ? 
IB_UCM_PRES_ALTERNATE : 0); break; case IB_CM_REP_RECEIVED: ib_ucm_event_rep_get(&uvt->resp.u.rep_resp, &evt->param.rep_rcvd); uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE; break; case IB_CM_RTU_RECEIVED: uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE; uvt->resp.u.send_status = evt->param.send_status; break; case IB_CM_DREQ_RECEIVED: uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE; uvt->resp.u.send_status = evt->param.send_status; break; case IB_CM_DREP_RECEIVED: uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE; uvt->resp.u.send_status = evt->param.send_status; break; case IB_CM_MRA_RECEIVED: uvt->resp.u.mra_resp.timeout = evt->param.mra_rcvd.service_timeout; uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE; break; case IB_CM_REJ_RECEIVED: uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason; uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; uvt->info_len = evt->param.rej_rcvd.ari_length; info = evt->param.rej_rcvd.ari; break; case IB_CM_LAP_RECEIVED: ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path, evt->param.lap_rcvd.alternate_path); uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE; uvt->resp.present = IB_UCM_PRES_ALTERNATE; break; case IB_CM_APR_RECEIVED: uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status; uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE; uvt->info_len = evt->param.apr_rcvd.info_len; info = evt->param.apr_rcvd.apr_info; break; case IB_CM_SIDR_REQ_RECEIVED: uvt->resp.u.sidr_req_resp.pkey = evt->param.sidr_req_rcvd.pkey; uvt->resp.u.sidr_req_resp.port = evt->param.sidr_req_rcvd.port; uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE; break; case IB_CM_SIDR_REP_RECEIVED: ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp, &evt->param.sidr_rep_rcvd); uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; uvt->info_len = evt->param.sidr_rep_rcvd.info_len; info = evt->param.sidr_rep_rcvd.info; break; default: uvt->resp.u.send_status = evt->param.send_status; break; } if (uvt->data_len) { uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL); if (!uvt->data) 
goto err1; uvt->resp.present |= IB_UCM_PRES_DATA; } if (uvt->info_len) { uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL); if (!uvt->info) goto err2; uvt->resp.present |= IB_UCM_PRES_INFO; } return 0; err2: kfree(uvt->data); err1: return -ENOMEM; } static int ib_ucm_event_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ib_ucm_event *uevent; struct ib_ucm_context *ctx; int result = 0; ctx = cm_id->context; uevent = kzalloc(sizeof *uevent, GFP_KERNEL); if (!uevent) goto err1; uevent->ctx = ctx; uevent->cm_id = cm_id; uevent->resp.uid = ctx->uid; uevent->resp.id = ctx->id; uevent->resp.event = event->event; result = ib_ucm_event_process(event, uevent); if (result) goto err2; mutex_lock(&ctx->file->file_mutex); list_add_tail(&uevent->file_list, &ctx->file->events); list_add_tail(&uevent->ctx_list, &ctx->events); wake_up_interruptible(&ctx->file->poll_wait); - if (ctx->file->filp) - selwakeup(&ctx->file->filp->f_selinfo); + linux_poll_wakeup(ctx->file->filp); mutex_unlock(&ctx->file->file_mutex); return 0; err2: kfree(uevent); err1: /* Destroy new cm_id's */ return ib_ucm_new_cm_id(event->event); } static ssize_t ib_ucm_event(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_context *ctx; struct ib_ucm_event_get cmd; struct ib_ucm_event *uevent; int result = 0; if (out_len < sizeof(struct ib_ucm_event_resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&file->file_mutex); while (list_empty(&file->events)) { mutex_unlock(&file->file_mutex); if (file->filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->poll_wait, !list_empty(&file->events))) return -ERESTARTSYS; mutex_lock(&file->file_mutex); } uevent = list_entry(file->events.next, struct ib_ucm_event, file_list); if (ib_ucm_new_cm_id(uevent->resp.event)) { ctx = ib_ucm_ctx_alloc(file); if (!ctx) { result = -ENOMEM; goto done; } ctx->cm_id = uevent->cm_id; ctx->cm_id->context = ctx; 
uevent->resp.id = ctx->id; } if (copy_to_user((void __user *)(unsigned long)cmd.response, &uevent->resp, sizeof(uevent->resp))) { result = -EFAULT; goto done; } if (uevent->data) { if (cmd.data_len < uevent->data_len) { result = -ENOMEM; goto done; } if (copy_to_user((void __user *)(unsigned long)cmd.data, uevent->data, uevent->data_len)) { result = -EFAULT; goto done; } } if (uevent->info) { if (cmd.info_len < uevent->info_len) { result = -ENOMEM; goto done; } if (copy_to_user((void __user *)(unsigned long)cmd.info, uevent->info, uevent->info_len)) { result = -EFAULT; goto done; } } list_del(&uevent->file_list); list_del(&uevent->ctx_list); uevent->ctx->events_reported++; kfree(uevent->data); kfree(uevent->info); kfree(uevent); done: mutex_unlock(&file->file_mutex); return result; } static ssize_t ib_ucm_create_id(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_create_id cmd; struct ib_ucm_create_id_resp resp; struct ib_ucm_context *ctx; int result; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&file->file_mutex); ctx = ib_ucm_ctx_alloc(file); mutex_unlock(&file->file_mutex); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; ctx->cm_id = ib_create_cm_id(file->device->ib_dev, ib_ucm_event_handler, ctx); if (IS_ERR(ctx->cm_id)) { result = PTR_ERR(ctx->cm_id); goto err1; } resp.id = ctx->id; if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) { result = -EFAULT; goto err2; } return 0; err2: ib_destroy_cm_id(ctx->cm_id); err1: mutex_lock(&ctx_id_mutex); idr_remove(&ctx_id_table, ctx->id); mutex_unlock(&ctx_id_mutex); kfree(ctx); return result; } static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_destroy_id cmd; struct ib_ucm_destroy_id_resp resp; struct ib_ucm_context *ctx; int result = 0; if (out_len < sizeof(resp)) return -ENOSPC; if 
(copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&ctx_id_mutex); ctx = idr_find(&ctx_id_table, cmd.id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); else idr_remove(&ctx_id_table, ctx->id); mutex_unlock(&ctx_id_mutex); if (IS_ERR(ctx)) return PTR_ERR(ctx); ib_ucm_ctx_put(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the cm_id. */ ib_destroy_cm_id(ctx->cm_id); /* Cleanup events not yet reported to the user. */ ib_ucm_cleanup_events(ctx); resp.events_reported = ctx->events_reported; if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) result = -EFAULT; kfree(ctx); return result; } static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_attr_id_resp resp; struct ib_ucm_attr_id cmd; struct ib_ucm_context *ctx; int result = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ib_ucm_ctx_get(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.service_id = ctx->cm_id->service_id; resp.service_mask = ctx->cm_id->service_mask; resp.local_id = ctx->cm_id->local_id; resp.remote_id = ctx->cm_id->remote_id; if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) result = -EFAULT; ib_ucm_ctx_put(ctx); return result; } static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_uverbs_qp_attr resp; struct ib_ucm_init_qp_attr cmd; struct ib_ucm_context *ctx; struct ib_qp_attr qp_attr; int result = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ib_ucm_ctx_get(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, 
&resp.qp_attr_mask); if (result) goto out; ib_copy_qp_attr_to_user(&resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) result = -EFAULT; out: ib_ucm_ctx_put(ctx); return result; } static int ucm_validate_listen(__be64 service_id, __be64 service_mask) { service_id &= service_mask; if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) || ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)) return -EINVAL; return 0; } static ssize_t ib_ucm_listen(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_listen cmd; struct ib_ucm_context *ctx; int result; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ib_ucm_ctx_get(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); result = ucm_validate_listen(cmd.service_id, cmd.service_mask); if (result) goto out; - result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask, - NULL); + result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); out: ib_ucm_ctx_put(ctx); return result; } static ssize_t ib_ucm_notify(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_notify cmd; struct ib_ucm_context *ctx; int result; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ib_ucm_ctx_get(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event); ib_ucm_ctx_put(ctx); return result; } static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) { void *data; *dest = NULL; if (!len) return 0; - data = kmalloc(len, GFP_KERNEL); - if (!data) - return -ENOMEM; + data = memdup_user((void __user *)(unsigned long)src, len); + if (IS_ERR(data)) + return PTR_ERR(data); - if (copy_from_user(data, (void __user *)(unsigned long)src, len)) { - kfree(data); - return -EFAULT; - } - *dest = data; return 0; } static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src) { struct 
ib_user_path_rec upath; struct ib_sa_path_rec *sa_path; *path = NULL; if (!src) return 0; sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL); if (!sa_path) return -ENOMEM; if (copy_from_user(&upath, (void __user *)(unsigned long)src, sizeof(upath))) { kfree(sa_path); return -EFAULT; } ib_copy_path_rec_from_user(sa_path, &upath); *path = sa_path; return 0; } static ssize_t ib_ucm_send_req(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_cm_req_param param; struct ib_ucm_context *ctx; struct ib_ucm_req cmd; int result; param.private_data = NULL; param.primary_path = NULL; param.alternate_path = NULL; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len); if (result) goto done; result = ib_ucm_path_get(&param.primary_path, cmd.primary_path); if (result) goto done; result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path); if (result) goto done; param.private_data_len = cmd.len; param.service_id = cmd.sid; param.qp_num = cmd.qpn; param.qp_type = cmd.qp_type; param.starting_psn = cmd.psn; param.peer_to_peer = cmd.peer_to_peer; param.responder_resources = cmd.responder_resources; param.initiator_depth = cmd.initiator_depth; param.remote_cm_response_timeout = cmd.remote_cm_response_timeout; param.flow_control = cmd.flow_control; param.local_cm_response_timeout = cmd.local_cm_response_timeout; param.retry_count = cmd.retry_count; param.rnr_retry_count = cmd.rnr_retry_count; param.max_cm_retries = cmd.max_cm_retries; param.srq = cmd.srq; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { result = ib_send_cm_req(ctx->cm_id, &param); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); done: kfree(param.private_data); kfree(param.primary_path); kfree(param.alternate_path); return result; } static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_cm_rep_param param; struct ib_ucm_context *ctx; struct
ib_ucm_rep cmd; int result; param.private_data = NULL; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len); if (result) return result; param.qp_num = cmd.qpn; param.starting_psn = cmd.psn; param.private_data_len = cmd.len; param.responder_resources = cmd.responder_resources; param.initiator_depth = cmd.initiator_depth; param.failover_accepted = cmd.failover_accepted; param.flow_control = cmd.flow_control; param.rnr_retry_count = cmd.rnr_retry_count; param.srq = cmd.srq; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { ctx->uid = cmd.uid; result = ib_send_cm_rep(ctx->cm_id, &param); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); kfree(param.private_data); return result; } static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int (*func)(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len)) { struct ib_ucm_private_data cmd; struct ib_ucm_context *ctx; const void *private_data = NULL; int result; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len); if (result) return result; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { result = func(ctx->cm_id, private_data, cmd.len); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); kfree(private_data); return result; } static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu); } static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq); } static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep); } static ssize_t ib_ucm_send_info(struct
ib_ucm_file *file, const char __user *inbuf, int in_len, int (*func)(struct ib_cm_id *cm_id, int status, const void *info, u8 info_len, const void *data, u8 data_len)) { struct ib_ucm_context *ctx; struct ib_ucm_info cmd; const void *data = NULL; const void *info = NULL; int result; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len); if (result) goto done; result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len); if (result) goto done; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { result = func(ctx->cm_id, cmd.status, info, cmd.info_len, data, cmd.data_len); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); done: kfree(data); kfree(info); return result; } static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej); } static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr); } static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_context *ctx; struct ib_ucm_mra cmd; const void *data = NULL; int result; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); if (result) return result; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); kfree(data); return result; } static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_ucm_context *ctx; struct ib_sa_path_rec *path = NULL; struct ib_ucm_lap cmd; const void *data = NULL; int result; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&data, cmd.data, 
cmd.len); if (result) goto done; result = ib_ucm_path_get(&path, cmd.path); if (result) goto done; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); done: kfree(data); kfree(path); return result; } static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_cm_sidr_req_param param; struct ib_ucm_context *ctx; struct ib_ucm_sidr_req cmd; int result; param.private_data = NULL; param.path = NULL; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len); if (result) goto done; result = ib_ucm_path_get(&param.path, cmd.path); if (result) goto done; param.private_data_len = cmd.len; param.service_id = cmd.sid; param.timeout_ms = cmd.timeout; param.max_cm_retries = cmd.max_cm_retries; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { result = ib_send_cm_sidr_req(ctx->cm_id, &param); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); done: kfree(param.private_data); kfree(param.path); return result; } static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { struct ib_cm_sidr_rep_param param; struct ib_ucm_sidr_rep cmd; struct ib_ucm_context *ctx; int result; param.info = NULL; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.data_len); if (result) goto done; result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len); if (result) goto done; param.qp_num = cmd.qpn; param.qkey = cmd.qkey; param.status = cmd.status; param.info_length = cmd.info_len; param.private_data_len = cmd.data_len; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { result = ib_send_cm_sidr_rep(ctx->cm_id, &param); ib_ucm_ctx_put(ctx); } else result = PTR_ERR(ctx); done: kfree(param.private_data); kfree(param.info); return result; }
static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) = { [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id, [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id, [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id, [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen, [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify, [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req, [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep, [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu, [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq, [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep, [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej, [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra, [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap, [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr, [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req, [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep, [IB_USER_CM_CMD_EVENT] = ib_ucm_event, [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr, }; static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct ib_ucm_file *file = filp->private_data; struct ib_ucm_cmd_hdr hdr; ssize_t result; + if (WARN_ON_ONCE(!ib_safe_file_access(filp))) + return -EACCES; + if (len < sizeof(hdr)) return -EINVAL; if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) return -EINVAL; result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); if (!result) result = len; return result; } static unsigned int ib_ucm_poll(struct file *filp, struct poll_table_struct *wait) { struct ib_ucm_file *file = filp->private_data; unsigned int mask = 0; poll_wait(filp, &file->poll_wait, wait); if (!list_empty(&file->events)) mask = POLLIN | POLLRDNORM; return mask; } /* * ib_ucm_open() does not need the BKL: * * - no global state is referred to; * - there is no ioctl method to race against; * - no further module initialization is required for open to work * 
after the device is registered. */ static int ib_ucm_open(struct inode *inode, struct file *filp) { struct ib_ucm_file *file; file = kmalloc(sizeof(*file), GFP_KERNEL); if (!file) return -ENOMEM; INIT_LIST_HEAD(&file->events); INIT_LIST_HEAD(&file->ctxs); init_waitqueue_head(&file->poll_wait); mutex_init(&file->file_mutex); filp->private_data = file; file->filp = filp; file->device = container_of(inode->i_cdev->si_drv1, struct ib_ucm_device, cdev); return nonseekable_open(inode, filp); } static int ib_ucm_close(struct inode *inode, struct file *filp) { struct ib_ucm_file *file = filp->private_data; struct ib_ucm_context *ctx; mutex_lock(&file->file_mutex); while (!list_empty(&file->ctxs)) { ctx = list_entry(file->ctxs.next, struct ib_ucm_context, file_list); mutex_unlock(&file->file_mutex); mutex_lock(&ctx_id_mutex); idr_remove(&ctx_id_table, ctx->id); mutex_unlock(&ctx_id_mutex); ib_destroy_cm_id(ctx->cm_id); ib_ucm_cleanup_events(ctx); kfree(ctx); mutex_lock(&file->file_mutex); } mutex_unlock(&file->file_mutex); kfree(file); return 0; } +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static void ib_ucm_release_dev(struct device *dev) { struct ib_ucm_device *ucm_dev; ucm_dev = container_of(dev, struct ib_ucm_device, dev); cdev_del(&ucm_dev->cdev); if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(ucm_dev->devnum, dev_map); + clear_bit(ucm_dev->devnum, dev_map); else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); + clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); kfree(ucm_dev); } static const struct file_operations ucm_fops = { - .owner = THIS_MODULE, - .open = ib_ucm_open, + .owner = THIS_MODULE, + .open = ib_ucm_open, .release = ib_ucm_close, - .write = ib_ucm_write, + .write = ib_ucm_write, .poll = ib_ucm_poll, .llseek = no_llseek, }; static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, char *buf) { struct ib_ucm_device *ucm_dev; ucm_dev = container_of(dev, struct ib_ucm_device, dev); return 
sprintf(buf, "%s\n", ucm_dev->ib_dev->name); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static int find_overflow_devnum(void) { int ret; if (!overflow_maj) { ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register dynamic device number\n"); + pr_err("ucm: couldn't register dynamic device number\n"); return ret; } } ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES); if (ret >= IB_UCM_MAX_DEVICES) return -1; return ret; } static void ib_ucm_add_one(struct ib_device *device) { int devnum; dev_t base; struct ib_ucm_device *ucm_dev; - if (!device->alloc_ucontext || - rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1)) return; ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); if (!ucm_dev) return; ucm_dev->ib_dev = device; devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); if (devnum >= IB_UCM_MAX_DEVICES) { devnum = find_overflow_devnum(); if (devnum < 0) - goto err; + goto err; ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; base = devnum + overflow_maj; set_bit(devnum, overflow_map); } else { ucm_dev->devnum = devnum; base = devnum + IB_UCM_BASE_DEV; set_bit(devnum, dev_map); } cdev_init(&ucm_dev->cdev, &ucm_fops); ucm_dev->cdev.owner = THIS_MODULE; kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); if (cdev_add(&ucm_dev->cdev, base, 1)) goto err; ucm_dev->dev.class = &cm_class; ucm_dev->dev.parent = device->dma_device; ucm_dev->dev.devt = ucm_dev->cdev.dev; ucm_dev->dev.release = ib_ucm_release_dev; dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum); if (device_register(&ucm_dev->dev)) goto err_cdev; if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) goto err_dev; ib_set_client_data(device, &ucm_client, ucm_dev); return; err_dev: device_unregister(&ucm_dev->dev); err_cdev: 
cdev_del(&ucm_dev->cdev); if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) clear_bit(devnum, dev_map); else clear_bit(devnum, overflow_map); err: kfree(ucm_dev); return; } -static void ib_ucm_remove_one(struct ib_device *device) +static void ib_ucm_remove_one(struct ib_device *device, void *client_data) { - struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client); + struct ib_ucm_device *ucm_dev = client_data; if (!ucm_dev) return; device_unregister(&ucm_dev->dev); } -static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION); -} +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_CM_ABI_VERSION)); -static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); - static int __init ib_ucm_init(void) { int ret; ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register device number\n"); + pr_err("ucm: couldn't register device number\n"); goto error1; } - ret = class_create_file(&cm_class, &class_attr_abi_version); + ret = class_create_file(&cm_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "ucm: couldn't create abi_version attribute\n"); + pr_err("ucm: couldn't create abi_version attribute\n"); goto error2; } ret = ib_register_client(&ucm_client); if (ret) { - printk(KERN_ERR "ucm: couldn't register client\n"); + pr_err("ucm: couldn't register client\n"); goto error3; } return 0; error3: - class_remove_file(&cm_class, &class_attr_abi_version); + class_remove_file(&cm_class, &class_attr_abi_version.attr); error2: unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); error1: return ret; } static void __exit ib_ucm_cleanup(void) { ib_unregister_client(&ucm_client); - class_remove_file(&cm_class, &class_attr_abi_version); + class_remove_file(&cm_class, &class_attr_abi_version.attr); unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); if 
(overflow_maj) unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); idr_destroy(&ctx_id_table); } module_init_order(ib_ucm_init, SI_ORDER_THIRD); module_exit(ib_ucm_cleanup); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ucma.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ucma.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ucma.c (revision 319974) @@ -1,1417 +1,1755 @@ /* * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; struct ucma_file { struct mutex mut; struct file *filp; struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; + struct workqueue_struct *close_wq; }; struct ucma_context { int id; struct completion comp; atomic_t ref; int events_reported; int backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; u64 uid; struct list_head list; struct list_head mc_list; + /* mark that device is in process of destroying the internal HW + * resources, protected by the global mut + */ + int closing; + /* sync between removal event and id destroy, protected by file mut */ + int destroying; + struct work_struct close_work; }; struct ucma_multicast { struct ucma_context *ctx; int id; int events_reported; u64 uid; + u8 join_state; struct list_head list; struct sockaddr_storage addr; }; struct ucma_event { struct ucma_context *ctx; struct ucma_multicast *mc; struct list_head list; struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; + struct work_struct close_work; }; static DEFINE_MUTEX(mut); static DEFINE_IDR(ctx_idr); static DEFINE_IDR(multicast_idr); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) { struct ucma_context *ctx; ctx = idr_find(&ctx_idr, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) { struct ucma_context *ctx; mutex_lock(&mut); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) - atomic_inc(&ctx->ref); + if (!IS_ERR(ctx)) { + if (ctx->closing) + ctx = ERR_PTR(-EIO); + else + atomic_inc(&ctx->ref); + } mutex_unlock(&mut); return ctx; } 
static void ucma_put_ctx(struct ucma_context *ctx) { if (atomic_dec_and_test(&ctx->ref)) complete(&ctx->comp); } +static void ucma_close_event_id(struct work_struct *work) +{ + struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); + + rdma_destroy_id(uevent_close->cm_id); + kfree(uevent_close); +} + +static void ucma_close_id(struct work_struct *work) +{ + struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); + + /* once all inflight tasks are finished, we close all underlying + * resources. The context is still alive till its explicit destroying + * by its creator. + */ + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + /* No new events will be generated after destroying the id. */ + rdma_destroy_id(ctx->cm_id); +} + static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; - int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + INIT_WORK(&ctx->close_work, ucma_close_id); atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); ctx->file = file; - do { - ret = idr_pre_get(&ctx_idr, GFP_KERNEL); - if (!ret) - goto error; - - mutex_lock(&mut); - ret = idr_get_new(&ctx_idr, ctx, &ctx->id); - mutex_unlock(&mut); - } while (ret == -EAGAIN); - - if (ret) + mutex_lock(&mut); + ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (ctx->id < 0) goto error; list_add_tail(&ctx->list, &file->ctx_list); return ctx; error: kfree(ctx); return NULL; } static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc; - int ret; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; - do { - ret = idr_pre_get(&multicast_idr, GFP_KERNEL); - if (!ret) - goto error; - - mutex_lock(&mut); - ret = idr_get_new(&multicast_idr, mc, &mc->id); - mutex_unlock(&mut); - } while (ret == -EAGAIN); - - if (ret) + mutex_lock(&mut); + mc->id = idr_alloc(&multicast_idr, mc, 0, 0, 
GFP_KERNEL); + mutex_unlock(&mut); + if (mc->id < 0) goto error; mc->ctx = ctx; list_add_tail(&mc->list, &ctx->mc_list); return mc; error: kfree(mc); return NULL; } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, struct rdma_conn_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; dst->responder_resources =src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; } static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } static void ucma_set_event_context(struct ucma_context *ctx, struct rdma_cm_event *event, struct ucma_event *uevent) { uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: - uevent->mc = (struct ucma_multicast *) - event->param.ud.private_data; + uevent->mc = __DECONST(struct ucma_multicast *, + event->param.ud.private_data); uevent->resp.uid = uevent->mc->uid; uevent->resp.id = uevent->mc->id; break; default: uevent->resp.uid = ctx->uid; uevent->resp.id = ctx->id; break; } } +/* Called with file->mut locked for the relevant context. 
*/ +static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +{ + struct ucma_context *ctx = cm_id->context; + struct ucma_event *con_req_eve; + int event_found = 0; + + if (ctx->destroying) + return; + + /* only if context is pointing to cm_id that it owns it and can be + * queued to be closed, otherwise that cm_id is an inflight one that + * is part of that context event list pending to be detached and + * reattached to its new context as part of ucma_get_event, + * handled separately below. + */ + if (ctx->cm_id == cm_id) { + mutex_lock(&mut); + ctx->closing = 1; + mutex_unlock(&mut); + queue_work(ctx->file->close_wq, &ctx->close_work); + return; + } + + list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { + if (con_req_eve->cm_id == cm_id && + con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { + list_del(&con_req_eve->list); + INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); + queue_work(ctx->file->close_wq, &con_req_eve->close_work); + event_found = 1; + break; + } + } + if (!event_found) + pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); +} + static int ucma_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; int ret = 0; uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); if (!uevent) return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; mutex_lock(&ctx->file->mut); uevent->cm_id = cm_id; ucma_set_event_context(ctx, event, uevent); uevent->resp.event = event->event; uevent->resp.status = event->status; if (cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { if (!ctx->backlog) { ret = -ENOMEM; kfree(uevent); goto out; } ctx->backlog--; - } else if (!ctx->uid) { + } else if (!ctx->uid || ctx->cm_id != cm_id) { /* * We ignore events for new 
connections until userspace has set * their context. This can only happen if an error occurs on a * new connection before the user accepts it. This is okay, - * since the accept will just fail later. + * since the accept will just fail later. However, we do need + * to release the underlying HW resources in case of a device + * removal event. */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); + kfree(uevent); goto out; } list_add_tail(&uevent->list, &ctx->file->event_list); wake_up_interruptible(&ctx->file->poll_wait); - if (ctx->file->filp) - selwakeup(&ctx->file->filp->f_selinfo); + linux_poll_wakeup(ctx->file->filp); + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); out: mutex_unlock(&ctx->file->mut); return ret; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct ucma_context *ctx; struct rdma_ucm_get_event cmd; struct ucma_event *uevent; int ret = 0; if (out_len < sizeof uevent->resp) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&file->mut); while (list_empty(&file->event_list)) { mutex_unlock(&file->mut); if (file->filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->poll_wait, !list_empty(&file->event_list))) return -ERESTARTSYS; mutex_lock(&file->mut); } uevent = list_entry(file->event_list.next, struct ucma_event, list); if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { ctx = ucma_alloc_ctx(file); if (!ctx) { ret = -ENOMEM; goto done; } uevent->ctx->backlog++; ctx->cm_id = uevent->cm_id; ctx->cm_id->context = ctx; uevent->resp.id = ctx->id; - ctx->cm_id->ucontext = ctx; } if (copy_to_user((void __user *)(unsigned long)cmd.response, &uevent->resp, sizeof uevent->resp)) { ret = -EFAULT; goto done; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; kfree(uevent); done: 
mutex_unlock(&file->mut); return ret; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) { switch (cmd->ps) { case RDMA_PS_TCP: *qp_type = IB_QPT_RC; return 0; case RDMA_PS_UDP: case RDMA_PS_IPOIB: *qp_type = IB_QPT_UD; return 0; case RDMA_PS_IB: *qp_type = cmd->qp_type; return 0; default: return -EINVAL; } } static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, - int in_len, int out_len) + int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; enum ib_qp_type qp_type; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ret = ucma_get_qp_type(&cmd, &qp_type); if (ret) return ret; mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); mutex_unlock(&file->mut); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type); + ctx->cm_id = rdma_create_id(TD_TO_VNET(curthread), + ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; } - ctx->cm_id->ucontext = ctx; resp.id = ctx->id; if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) { ret = -EFAULT; goto err2; } return 0; err2: rdma_destroy_id(ctx->cm_id); err1: mutex_lock(&mut); idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); kfree(ctx); return ret; } static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; mutex_lock(&mut); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); idr_remove(&multicast_idr, mc->id); kfree(mc); } mutex_unlock(&mut); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; list_del(&uevent->list); kfree(uevent); } } /* - * We cannot hold file->mut when 
calling rdma_destroy_id() or we can - * deadlock. We also acquire file->mut in ucma_event_handler(), and - * rdma_destroy_id() will wait until all callbacks have completed. + * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At + * this point, no new events will be reported from the hardware. However, we + * still need to cleanup the UCMA context for this ID. Specifically, there + * might be events that have not yet been consumed by the user space software. + * These might include pending connect requests which we have not completed + * processing. We cannot call rdma_destroy_id while holding the lock of the + * context (file->mut), as it might cause a deadlock. We therefore extract all + * relevant events from the context pending events list while holding the + * mutex. After that we release them as needed. */ static int ucma_free_ctx(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); - /* No new events will be generated after destroying the id. */ - rdma_destroy_id(ctx->cm_id); ucma_cleanup_multicast(ctx); /* Cleanup events not yet reported to the user. 
*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { if (uevent->ctx == ctx) list_move_tail(&uevent->list, &list); } list_del(&ctx->list); mutex_unlock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &list, list) { list_del(&uevent->list); if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) rdma_destroy_id(uevent->cm_id); kfree(uevent); } events_reported = ctx->events_reported; kfree(ctx); return events_reported; } static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_context *ctx; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&mut); ctx = _ucma_find_context(cmd.id, file); if (!IS_ERR(ctx)) idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - resp.events_reported = ucma_free_ctx(ctx); + mutex_lock(&ctx->file->mut); + ctx->destroying = 1; + mutex_unlock(&ctx->file->mut); + flush_workqueue(ctx->file->close_wq); + /* At this point it's guaranteed that there is no inflight + * closing task */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + + resp.events_reported = ucma_free_ctx(ctx); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; return ret; } -static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf, +static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_bind_addr cmd; + struct rdma_ucm_bind_ip cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, 
cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); ucma_put_ctx(ctx); return ret; } +static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind cmd; + struct sockaddr *addr; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + addr = (struct sockaddr *) &cmd.addr; + if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ip(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, + cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_resolve_addr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; + struct sockaddr *src, *dst; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + src = (struct sockaddr *) &cmd.src_addr; + dst = (struct sockaddr *) &cmd.dst_addr; + if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst))) + return -EINVAL; + ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, - (struct sockaddr *) &cmd.dst_addr, - cmd.timeout_ms); + ret = rdma_resolve_addr(ctx->cm_id, src, dst, 
cmd.timeout_ms); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_route cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); ucma_put_ctx(ctx); return ret; } static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; resp->num_paths = route->num_paths; switch (route->num_paths) { case 0: dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); /* fall through */ case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { resp->num_paths = route->num_paths; switch (route->num_paths) { case 0: rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, (union ib_gid *)&resp->ib_route[0].dgid); rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, (union ib_gid *)&resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); /* fall through */ case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) 
&resp->ib_route[0].sgid); } static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_query_route cmd; + struct rdma_ucm_query cmd; struct rdma_ucm_query_route_resp resp; struct ucma_context *ctx; struct sockaddr *addr; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); if (!ctx->cm_id->device) goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; - switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(ctx->cm_id->device, - ctx->cm_id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ucma_copy_ib_route(&resp, &ctx->cm_id->route); - break; - case IB_LINK_LAYER_ETHERNET: - ucma_copy_iboe_route(&resp, &ctx->cm_id->route); - break; - default: - break; - } - break; - case RDMA_TRANSPORT_IWARP: + + if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iw_route(&resp, &ctx->cm_id->route); - break; - default: - break; - } out: if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; ucma_put_ctx(ctx); return ret; } -static void ucma_copy_conn_param(struct 
rdma_conn_param *dst, +static void ucma_query_device_addr(struct rdma_cm_id *cm_id, + struct rdma_ucm_query_addr_resp *resp) +{ + if (!cm_id->device) + return; + + resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->port_num = cm_id->port_num; + resp->pkey = (__force __u16) cpu_to_be16( + ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); +} + +static ssize_t ucma_query_addr(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + resp.src_size = rdma_addr_size(addr); + memcpy(&resp.src_addr, addr, resp.src_size); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + resp.dst_size = rdma_addr_size(addr); + memcpy(&resp.dst_addr, addr, resp.dst_size); + + ucma_query_device_addr(ctx->cm_id, &resp); + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query_path(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_path_resp *resp; + int i, ret = 0; + + if (out_len < sizeof(*resp)) + return -ENOSPC; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_paths = ctx->cm_id->route.num_paths; + for (i = 0, out_len -= sizeof(*resp); + i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); + i++, out_len -= sizeof(struct ib_path_rec_data)) { + + resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL; + ib_sa_pack_path(&ctx->cm_id->route.path_rec[i], + &resp->path_data[i].path_rec); + } + + if (copy_to_user(response, resp, + sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + ret = -EFAULT; + + kfree(resp); + return ret; +} + +static ssize_t ucma_query_gid(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct 
rdma_ucm_query_addr_resp resp; + struct sockaddr_ib *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + ucma_query_device_addr(ctx->cm_id, &resp); + + addr = (struct sockaddr_ib *) &resp.src_addr; + resp.src_size = sizeof(*addr); + if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.src_addr); + } + + addr = (struct sockaddr_ib *) &resp.dst_addr; + resp.dst_size = sizeof(*addr); + if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.dst_addr); + } + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct ucma_context *ctx; + void __user *response; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + response = (void __user *)(unsigned long) cmd.response; + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + switch (cmd.option) { + case RDMA_USER_CM_QUERY_ADDR: + ret = ucma_query_addr(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_PATH: + ret = ucma_query_path(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_GID: + ret = ucma_query_gid(ctx, response, out_len); + break; + 
default: + ret = -ENOSYS; + break; + } + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_conn_param(struct rdma_cm_id *id, + struct rdma_conn_param *dst, struct rdma_ucm_conn_param *src) { dst->private_data = src->private_data; dst->private_data_len = src->private_data_len; dst->responder_resources =src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; + dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_connect cmd; struct rdma_conn_param conn_param; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!cmd.conn_param.valid) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); ret = rdma_connect(ctx->cm_id, &conn_param); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_listen cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? 
cmd.backlog : max_backlog; ret = rdma_listen(ctx->cm_id, ctx->backlog); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (cmd.conn_param.valid) { - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); ret = rdma_accept(ctx->cm_id, &conn_param); if (!ret) ctx->uid = cmd.uid; mutex_unlock(&file->mut); } else ret = rdma_accept(ctx->cm_id, NULL); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_reject cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_disconnect cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = rdma_disconnect(ctx->cm_id); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_init_qp_attr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_init_qp_attr cmd; struct ib_uverbs_qp_attr resp; struct ucma_context *ctx; struct ib_qp_attr qp_attr; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); 
resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); if (ret) goto out; ib_copy_qp_attr_to_user(&resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; out: ucma_put_ctx(ctx); return ret; } static int ucma_set_option_id(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { int ret = 0; switch (optname) { case RDMA_OPTION_ID_TOS: if (optlen != sizeof(u8)) { ret = -EINVAL; break; } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; case RDMA_OPTION_ID_REUSEADDR: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); break; case RDMA_OPTION_ID_AFONLY: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; default: ret = -ENOSYS; } return ret; } static int ucma_set_ib_path(struct ucma_context *ctx, struct ib_path_rec_data *path_data, size_t optlen) { struct ib_sa_path_rec sa_path; struct rdma_cm_event event; int ret; if (optlen % sizeof(*path_data)) return -EINVAL; for (; optlen; optlen -= sizeof(*path_data), path_data++) { if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL)) break; } if (!optlen) return -EINVAL; + memset(&sa_path, 0, sizeof(sa_path)); + ib_sa_unpack_path(path_data->path_rec, &sa_path); ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); if (ret) return ret; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; return ucma_event_handler(ctx->cm_id, &event); } static int ucma_set_option_ib(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { - int ret = 0; + int ret; switch (optname) { case RDMA_OPTION_IB_PATH: ret = ucma_set_ib_path(ctx, optval, optlen); break; - - case RDMA_OPTION_IB_APM: - if (optlen != sizeof(u8)) { - ret = -EINVAL; - break; - } - if (*(u8 *)optval) - 
ret = rdma_enable_apm(ctx->cm_id, RDMA_ALT_PATH_BEST); - break; - default: ret = -ENOSYS; } return ret; } static int ucma_set_option_level(struct ucma_context *ctx, int level, int optname, void *optval, size_t optlen) { int ret; switch (level) { case RDMA_OPTION_ID: ret = ucma_set_option_id(ctx, optname, optval, optlen); break; case RDMA_OPTION_IB: ret = ucma_set_option_ib(ctx, optname, optval, optlen); break; default: ret = -ENOSYS; } return ret; } static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_set_option cmd; struct ucma_context *ctx; void *optval; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - optval = kmalloc(cmd.optlen, GFP_KERNEL); - if (!optval) { - ret = -ENOMEM; - goto err_ucma_put_ctx; + optval = memdup_user((void __user *) (unsigned long) cmd.optval, + cmd.optlen); + if (IS_ERR(optval)) { + ret = PTR_ERR(optval); + goto out; } - if (copy_from_user(optval, (void __user *)(unsigned long)cmd.optval, - cmd.optlen)) { - ret = -EFAULT; - goto err_kfree; - } - ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, cmd.optlen); - -err_kfree: kfree(optval); -err_ucma_put_ctx: + +out: ucma_put_ctx(ctx); return ret; } static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_notify cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = rdma_notify(ctx->cm_id, (enum ib_event_type) cmd.event); ucma_put_ctx(ctx); return ret; } -static ssize_t ucma_join_multicast(struct ucma_file *file, - const char __user *inbuf, - int in_len, int out_len) +static ssize_t ucma_process_join(struct ucma_file *file, + struct rdma_ucm_join_mcast *cmd, int out_len) { - struct rdma_ucm_join_mcast cmd; struct 
rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct ucma_multicast *mc; + struct sockaddr *addr; int ret; + u8 join_state; if (out_len < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; + addr = (struct sockaddr *) &cmd->addr; + if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + return -EINVAL; - ctx = ucma_get_ctx(file, cmd.id); + if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) + join_state = BIT(FULLMEMBER_JOIN); + else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) + join_state = BIT(SENDONLY_FULLMEMBER_JOIN); + else + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd->id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&file->mut); mc = ucma_alloc_multicast(ctx); if (!mc) { ret = -ENOMEM; goto err1; } - - mc->uid = cmd.uid; - memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr); - ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); + mc->join_state = join_state; + mc->uid = cmd->uid; + memcpy(&mc->addr, addr, cmd->addr_size); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, + join_state, mc); if (ret) goto err2; resp.id = mc->id; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user((void __user *)(unsigned long) cmd->response, &resp, sizeof(resp))) { ret = -EFAULT; goto err3; } mutex_unlock(&file->mut); ucma_put_ctx(ctx); return 0; err3: rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); ucma_cleanup_mc_events(mc); err2: mutex_lock(&mut); idr_remove(&multicast_idr, mc->id); mutex_unlock(&mut); list_del(&mc->list); kfree(mc); err1: mutex_unlock(&file->mut); ucma_put_ctx(ctx); return ret; } +static ssize_t ucma_join_ip_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_ip_mcast cmd; + struct rdma_ucm_join_mcast join_cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + join_cmd.response = cmd.response; + 
join_cmd.uid = cmd.uid; + join_cmd.id = cmd.id; + join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); + join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; + memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); + + return ucma_process_join(file, &join_cmd, out_len); +} + +static ssize_t ucma_join_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_mcast cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + return ucma_process_join(file, &cmd, out_len); +} + static ssize_t ucma_leave_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_multicast *mc; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&mut); mc = idr_find(&multicast_idr, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); else if (mc->ctx->file != file) mc = ERR_PTR(-EINVAL); - else { + else if (!atomic_inc_not_zero(&mc->ctx->ref)) + mc = ERR_PTR(-ENXIO); + else idr_remove(&multicast_idr, mc->id); - atomic_inc(&mc->ctx->ref); - } mutex_unlock(&mut); if (IS_ERR(mc)) { ret = PTR_ERR(mc); goto out; } rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_lock(&mc->ctx->file->mut); ucma_cleanup_mc_events(mc); list_del(&mc->list); mutex_unlock(&mc->ctx->file->mut); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; kfree(mc); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; out: return ret; } static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) { /* Acquire mutex's based on pointer comparison to prevent deadlock. 
*/ if (file1 < file2) { mutex_lock(&file1->mut); - mutex_lock(&file2->mut); + mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); } else { mutex_lock(&file2->mut); - mutex_lock(&file1->mut); + mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); } } static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) { if (file1 < file2) { mutex_unlock(&file2->mut); mutex_unlock(&file1->mut); } else { mutex_unlock(&file1->mut); mutex_unlock(&file2->mut); } } static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) { struct ucma_event *uevent, *tmp; list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) if (uevent->ctx == ctx) list_move_tail(&uevent->list, &file->event_list); } static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; struct ucma_context *ctx; struct fd f; struct ucma_file *cur_file; int ret = 0; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; /* Get current fd to protect against it being closed */ f = fdget(cmd.fd); if (!f.file) return -ENOENT; /* Validate current fd and prevent destruction of id. */ ctx = ucma_get_ctx(f.file->private_data, cmd.id); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto file_put; } cur_file = ctx->file; if (cur_file == new_file) { resp.events_reported = ctx->events_reported; goto response; } /* * Migrate events between fd's, maintaining order, and avoiding new * events being added before existing events. 
*/ ucma_lock_files(cur_file, new_file); mutex_lock(&mut); list_move_tail(&ctx->list, &new_file->ctx_list); ucma_move_events(ctx, new_file); ctx->file = new_file; resp.events_reported = ctx->events_reported; mutex_unlock(&mut); ucma_unlock_files(cur_file, new_file); response: if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; ucma_put_ctx(ctx); file_put: fdput(f); return ret; } static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { - [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, - [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, - [RDMA_USER_CM_CMD_BIND_ADDR] = ucma_bind_addr, - [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route, - [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, - [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, - [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, - [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, - [RDMA_USER_CM_CMD_REJECT] = ucma_reject, - [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, - [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, - [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, - [RDMA_USER_CM_CMD_GET_OPTION] = NULL, - [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, - [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, - [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, - [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id + [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, + [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, + [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, + [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, + [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, + [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, + [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, + [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, + [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, + [RDMA_USER_CM_CMD_REJECT] = ucma_reject, + [RDMA_USER_CM_CMD_DISCONNECT] = 
ucma_disconnect, + [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, + [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, + [RDMA_USER_CM_CMD_GET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, + [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, + [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, + [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, + [RDMA_USER_CM_CMD_QUERY] = ucma_query, + [RDMA_USER_CM_CMD_BIND] = ucma_bind, + [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast }; static ssize_t ucma_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct ucma_file *file = filp->private_data; struct rdma_ucm_cmd_hdr hdr; ssize_t ret; + if (WARN_ON_ONCE(!ib_safe_file_access(filp))) + return -EACCES; + if (len < sizeof(hdr)) return -EINVAL; if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) return -EINVAL; if (!ucma_cmd_table[hdr.cmd]) return -ENOSYS; ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); if (!ret) ret = len; return ret; } static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait) { struct ucma_file *file = filp->private_data; unsigned int mask = 0; poll_wait(filp, &file->poll_wait, wait); if (!list_empty(&file->event_list)) mask = POLLIN | POLLRDNORM; return mask; } /* * ucma_open() does not need the BKL: * * - no global state is referred to; * - there is no ioctl method to race against; * - no further module initialization is required for open to work * after the device is registered. 
*/ static int ucma_open(struct inode *inode, struct file *filp) { struct ucma_file *file; file = kmalloc(sizeof *file, GFP_KERNEL); if (!file) return -ENOMEM; + file->close_wq = alloc_ordered_workqueue("ucma_close_id", + WQ_MEM_RECLAIM); + if (!file->close_wq) { + kfree(file); + return -ENOMEM; + } + INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); mutex_init(&file->mut); filp->private_data = file; file->filp = filp; return nonseekable_open(inode, filp); } static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; struct ucma_context *ctx, *tmp; mutex_lock(&file->mut); list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { + ctx->destroying = 1; mutex_unlock(&file->mut); mutex_lock(&mut); idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); + flush_workqueue(file->close_wq); + /* At that step once ctx was marked as destroying and workqueue + * was flushed we are safe from any inflights handlers that + * might put other closing task. + */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + /* rdma_destroy_id ensures that no event handlers are + * inflight for that id before releasing it. 
+ */ + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + ucma_free_ctx(ctx); mutex_lock(&file->mut); } mutex_unlock(&file->mut); + destroy_workqueue(file->close_wq); kfree(file); return 0; } static long ucma_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FIONBIO: case FIOASYNC: return (0); default: return (-ENOTTY); } } static const struct file_operations ucma_fops = { .owner = THIS_MODULE, .open = ucma_open, .release = ucma_close, .write = ucma_write, .unlocked_ioctl = ucma_ioctl, .poll = ucma_poll, .llseek = no_llseek, }; static struct miscdevice ucma_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "rdma_cm", + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", .nodename = "infiniband/rdma_cm", .mode = 0666, - .fops = &ucma_fops, + .fops = &ucma_fops, }; static ssize_t show_abi_version(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); } static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); static int __init ucma_init(void) { int ret; ret = misc_register(&ucma_misc); if (ret) return ret; ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { - printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); + pr_err("rdma_ucm: couldn't create abi_version attr\n"); goto err1; } return 0; err1: misc_deregister(&ucma_misc); return ret; } static void __exit ucma_cleanup(void) { device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); idr_destroy(&ctx_idr); + idr_destroy(&multicast_idr); } module_init(ucma_init); module_exit(ucma_cleanup); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ud_header.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ud_header.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/ud_header.c (revision 319974) @@ -1,414 +1,547 
@@ /* * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include -#include #include #include +#include + #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \ .field_name = #header ":" #field static const struct ib_field lrh_table[] = { { STRUCT_FIELD(lrh, virtual_lane), .offset_words = 0, .offset_bits = 0, .size_bits = 4 }, { STRUCT_FIELD(lrh, link_version), .offset_words = 0, .offset_bits = 4, .size_bits = 4 }, { STRUCT_FIELD(lrh, service_level), .offset_words = 0, .offset_bits = 8, .size_bits = 4 }, { RESERVED, .offset_words = 0, .offset_bits = 12, .size_bits = 2 }, { STRUCT_FIELD(lrh, link_next_header), .offset_words = 0, .offset_bits = 14, .size_bits = 2 }, { STRUCT_FIELD(lrh, destination_lid), .offset_words = 0, .offset_bits = 16, .size_bits = 16 }, { RESERVED, .offset_words = 1, .offset_bits = 0, .size_bits = 5 }, { STRUCT_FIELD(lrh, packet_length), .offset_words = 1, .offset_bits = 5, .size_bits = 11 }, { STRUCT_FIELD(lrh, source_lid), .offset_words = 1, .offset_bits = 16, .size_bits = 16 } }; static const struct ib_field eth_table[] = { { STRUCT_FIELD(eth, dmac_h), .offset_words = 0, .offset_bits = 0, .size_bits = 32 }, { STRUCT_FIELD(eth, dmac_l), .offset_words = 1, .offset_bits = 0, .size_bits = 16 }, { STRUCT_FIELD(eth, smac_h), .offset_words = 1, .offset_bits = 16, .size_bits = 16 }, { STRUCT_FIELD(eth, smac_l), .offset_words = 2, .offset_bits = 0, .size_bits = 32 }, { STRUCT_FIELD(eth, type), .offset_words = 3, .offset_bits = 0, .size_bits = 16 } }; static const struct ib_field vlan_table[] = { { STRUCT_FIELD(vlan, tag), .offset_words = 0, .offset_bits = 0, .size_bits = 16 }, { STRUCT_FIELD(vlan, type), .offset_words = 0, .offset_bits = 16, .size_bits = 16 } }; +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 
4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, .offset_bits = 0, .size_bits = 4 }, { STRUCT_FIELD(grh, traffic_class), .offset_words = 0, .offset_bits = 4, .size_bits = 8 }, { STRUCT_FIELD(grh, flow_label), .offset_words = 0, .offset_bits = 12, .size_bits = 20 }, { STRUCT_FIELD(grh, payload_length), .offset_words = 1, .offset_bits = 0, .size_bits = 16 }, { STRUCT_FIELD(grh, next_header), .offset_words = 1, .offset_bits = 16, .size_bits = 8 }, { STRUCT_FIELD(grh, hop_limit), .offset_words = 1, .offset_bits = 24, .size_bits = 8 }, { STRUCT_FIELD(grh, source_gid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { STRUCT_FIELD(grh, destination_gid), 
.offset_words = 6, .offset_bits = 0, .size_bits = 128 } }; static const struct ib_field bth_table[] = { { STRUCT_FIELD(bth, opcode), .offset_words = 0, .offset_bits = 0, .size_bits = 8 }, { STRUCT_FIELD(bth, solicited_event), .offset_words = 0, .offset_bits = 8, .size_bits = 1 }, { STRUCT_FIELD(bth, mig_req), .offset_words = 0, .offset_bits = 9, .size_bits = 1 }, { STRUCT_FIELD(bth, pad_count), .offset_words = 0, .offset_bits = 10, .size_bits = 2 }, { STRUCT_FIELD(bth, transport_header_version), .offset_words = 0, .offset_bits = 12, .size_bits = 4 }, { STRUCT_FIELD(bth, pkey), .offset_words = 0, .offset_bits = 16, .size_bits = 16 }, { RESERVED, .offset_words = 1, .offset_bits = 0, .size_bits = 8 }, { STRUCT_FIELD(bth, destination_qpn), .offset_words = 1, .offset_bits = 8, .size_bits = 24 }, { STRUCT_FIELD(bth, ack_req), .offset_words = 2, .offset_bits = 0, .size_bits = 1 }, { RESERVED, .offset_words = 2, .offset_bits = 1, .size_bits = 7 }, { STRUCT_FIELD(bth, psn), .offset_words = 2, .offset_bits = 8, .size_bits = 24 } }; static const struct ib_field deth_table[] = { { STRUCT_FIELD(deth, qkey), .offset_words = 0, .offset_bits = 0, .size_bits = 32 }, { RESERVED, .offset_words = 1, .offset_bits = 0, .size_bits = 8 }, { STRUCT_FIELD(deth, source_qpn), .offset_words = 1, .offset_bits = 8, .size_bits = 24 } }; +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header) +{ + struct ip iph; + + iph.ip_hl = 5; + iph.ip_v = 4; + iph.ip_tos = header->ip4.tos; + iph.ip_len = header->ip4.tot_len; + iph.ip_id = header->ip4.id; + iph.ip_off = header->ip4.frag_off; + iph.ip_ttl = header->ip4.ttl; + iph.ip_p = header->ip4.protocol; + iph.ip_sum = 0; + iph.ip_src.s_addr = header->ip4.saddr; + iph.ip_dst.s_addr = header->ip4.daddr; + + return in_cksum_hdr(&iph); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload * @lrh_present: specify if LRH is present * @eth_present: specify if Eth header is present * 
@vlan_present: packet is tagged vlan - * @grh_present:GRH flag (if non-zero, GRH will be included) + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present :if non-zero, UDP header will be included * @immediate_present: specify if immediate data is present * @header:Structure to initialize */ -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header) +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) { + size_t udp_bytes = udp_present ? IB_UDP_BYTES : 0; + + grh_present = grh_present && !ip_version; memset(header, 0, sizeof *header); + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + if (lrh_present) { - u16 packet_length = 0; + u16 packet_length; header->lrh.link_version = 0; header->lrh.link_next_header = grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; packet_length = (IB_LRH_BYTES + IB_BTH_BYTES + IB_DETH_BYTES + (grh_present ? IB_GRH_BYTES : 0) + payload_bytes + 4 + /* ICRC */ 3) / 4; /* round up */ header->lrh.packet_length = cpu_to_be16(packet_length); } if (vlan_present) - header->eth.type = cpu_to_be16(ETH_P_8021Q); + header->eth.type = cpu_to_be16(ETH_P_8021Q); - if (grh_present) { - header->grh.ip_version = 6; - header->grh.payload_length = - cpu_to_be16((IB_BTH_BYTES + + if (ip_version == 6 || grh_present) { + header->grh.ip_version = 6; + header->grh.payload_length = + cpu_to_be16((udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4 + /* ICRC */ + 3) & ~3); /* round up */ + header->grh.next_header = udp_present ? 
IPPROTO_UDP : 0x1b; + } + + if (ip_version == 4) { + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes + - 4 + /* ICRC */ - 3) & ~3); /* round up */ - header->grh.next_header = 0x1b; + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; else header->bth.opcode = IB_OPCODE_UD_SEND_ONLY; header->bth.pad_count = (4 - payload_bytes) & 3; header->bth.transport_header_version = 0; header->lrh_present = lrh_present; header->eth_present = eth_present; header->vlan_present = vlan_present; - header->grh_present = grh_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; header->immediate_present = immediate_present; + return 0; } EXPORT_SYMBOL(ib_ud_header_init); /** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into * * ib_ud_header_pack() packs the UD header structure @header into wire * format in the buffer @buf. 
*/ int ib_ud_header_pack(struct ib_ud_header *header, void *buf) { int len = 0; if (header->lrh_present) { ib_pack(lrh_table, ARRAY_SIZE(lrh_table), - &header->lrh, buf + len); + &header->lrh, (char *)buf + len); len += IB_LRH_BYTES; } if (header->eth_present) { ib_pack(eth_table, ARRAY_SIZE(eth_table), - &header->eth, buf + len); + &header->eth, (char *)buf + len); len += IB_ETH_BYTES; } if (header->vlan_present) { ib_pack(vlan_table, ARRAY_SIZE(vlan_table), - &header->vlan, buf + len); + &header->vlan, (char *)buf + len); len += IB_VLAN_BYTES; } if (header->grh_present) { ib_pack(grh_table, ARRAY_SIZE(grh_table), - &header->grh, buf + len); + &header->grh, (char *)buf + len); len += IB_GRH_BYTES; } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, (char *)buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, (char *)buf + len); + len += IB_UDP_BYTES; + } ib_pack(bth_table, ARRAY_SIZE(bth_table), - &header->bth, buf + len); + &header->bth, (char *)buf + len); len += IB_BTH_BYTES; ib_pack(deth_table, ARRAY_SIZE(deth_table), - &header->deth, buf + len); + &header->deth, (char *)buf + len); len += IB_DETH_BYTES; if (header->immediate_present) { - memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data); + memcpy((char *)buf + len, &header->immediate_data, sizeof header->immediate_data); len += sizeof header->immediate_data; } return len; } EXPORT_SYMBOL(ib_ud_header_pack); /** * ib_ud_header_unpack - Unpack UD header struct from wire format * @header:UD header struct * @buf:Buffer to pack into * * ib_ud_header_pack() unpacks the UD header structure @header from wire * format in the buffer @buf. 
*/ int ib_ud_header_unpack(void *buf, struct ib_ud_header *header) { ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, &header->lrh); - buf += IB_LRH_BYTES; + buf = (char *)buf + IB_LRH_BYTES; if (header->lrh.link_version != 0) { - printk(KERN_WARNING "Invalid LRH.link_version %d\n", - header->lrh.link_version); + pr_warn("Invalid LRH.link_version %d\n", + header->lrh.link_version); return -EINVAL; } switch (header->lrh.link_next_header) { case IB_LNH_IBA_LOCAL: header->grh_present = 0; break; case IB_LNH_IBA_GLOBAL: header->grh_present = 1; ib_unpack(grh_table, ARRAY_SIZE(grh_table), buf, &header->grh); - buf += IB_GRH_BYTES; + buf = (char *)buf + IB_GRH_BYTES; if (header->grh.ip_version != 6) { - printk(KERN_WARNING "Invalid GRH.ip_version %d\n", - header->grh.ip_version); + pr_warn("Invalid GRH.ip_version %d\n", + header->grh.ip_version); return -EINVAL; } if (header->grh.next_header != 0x1b) { - printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n", - header->grh.next_header); + pr_warn("Invalid GRH.next_header 0x%02x\n", + header->grh.next_header); return -EINVAL; } break; default: - printk(KERN_WARNING "Invalid LRH.link_next_header %d\n", - header->lrh.link_next_header); + pr_warn("Invalid LRH.link_next_header %d\n", + header->lrh.link_next_header); return -EINVAL; } ib_unpack(bth_table, ARRAY_SIZE(bth_table), buf, &header->bth); - buf += IB_BTH_BYTES; + buf = (char *)buf + IB_BTH_BYTES; switch (header->bth.opcode) { case IB_OPCODE_UD_SEND_ONLY: header->immediate_present = 0; break; case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE: header->immediate_present = 1; break; default: - printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n", - header->bth.opcode); + pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); return -EINVAL; } if (header->bth.transport_header_version != 0) { - printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n", - header->bth.transport_header_version); + pr_warn("Invalid BTH.transport_header_version %d\n", + 
header->bth.transport_header_version); return -EINVAL; } ib_unpack(deth_table, ARRAY_SIZE(deth_table), buf, &header->deth); - buf += IB_DETH_BYTES; + buf = (char *)buf + IB_DETH_BYTES; if (header->immediate_present) memcpy(&header->immediate_data, buf, sizeof header->immediate_data); return 0; } EXPORT_SYMBOL(ib_ud_header_unpack); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/user_mad.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/user_mad.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/user_mad.c (revision 319974) @@ -1,1313 +1,1404 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008 Cisco. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#define pr_fmt(fmt) "user_mad: " fmt + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); enum { IB_UMAD_MAX_PORTS = 64, IB_UMAD_MAX_AGENTS = 32, IB_UMAD_MAJOR = 231, IB_UMAD_MINOR_BASE = 0 }; /* * Our lifetime rules for these structs are the following: * device special file is opened, we take a reference on the * ib_umad_port's struct ib_umad_device. We drop these * references in the corresponding close(). * * In addition to references coming from open character devices, there * is one more reference to each ib_umad_device representing the * module's reference taken when allocating the ib_umad_device in * ib_umad_add_one(). * * When destroying an ib_umad_device, we drop the module's reference. 
*/ struct ib_umad_port { - struct cdev *cdev; + struct cdev cdev; struct device *dev; - struct cdev *sm_cdev; + struct cdev sm_cdev; struct device *sm_dev; struct semaphore sm_sem; struct mutex file_mutex; struct list_head file_list; struct ib_device *ib_dev; struct ib_umad_device *umad_dev; int dev_num; u8 port_num; - struct list_head port_lst; }; struct ib_umad_device { - int start_port, end_port; - struct kref ref; + struct kobject kobj; struct ib_umad_port port[0]; }; struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; struct file *filp; struct list_head recv_list; struct list_head send_list; struct list_head port_list; spinlock_t send_lock; wait_queue_head_t recv_wait; struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; int agents_dead; u8 use_pkey_index; u8 already_used; }; struct ib_umad_packet { struct ib_mad_send_buf *msg; struct ib_mad_recv_wc *recv_wc; struct list_head list; int length; struct ib_user_mad mad; }; static struct class *umad_class; static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); static DEFINE_SPINLOCK(port_lock); static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); -static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); -static void ib_umad_remove_one(struct ib_device *device); +static void ib_umad_remove_one(struct ib_device *device, void *client_data); -static DEFINE_SPINLOCK(ports_list_lock); -static struct list_head ports_list; - - -static void remove_ports(struct kref *ref) +static void ib_umad_release_dev(struct kobject *kobj) { - int i; - struct ib_umad_port *p, *p1; struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); + container_of(kobj, struct ib_umad_device, kobj); - for (i = 0; i <= dev->end_port - dev->start_port; ++i) { - struct ib_umad_port *port = &dev->port[i]; - - list_for_each_entry_safe(p, p1, &ports_list, port_lst) - if (p == port) { - list_del(&p->port_lst); - break; - } - } -} - -static void 
put_umad_dev(struct kref *ref) -{ - int ret, i; - struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); - - spin_lock(&ports_list_lock); - ret = (kref_put(ref, remove_ports)); - spin_unlock(&ports_list_lock); - if (ret) { - for (i = 0; i <= dev->end_port - dev->start_port; ++i) { - if (dev->port[i].dev_num < IB_UMAD_MAX_PORTS) - clear_bit(dev->port[i].dev_num, dev_map); - else - clear_bit(dev->port[i].dev_num - IB_UMAD_MAX_PORTS, overflow_map); - cdev_del(dev->port[i].cdev); - cdev_del(dev->port[i].sm_cdev); - } kfree(dev); - } } -static void release_port(struct ib_umad_port *port) -{ - put_umad_dev(&port->umad_dev->ref); -} +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; - -static struct ib_umad_port *get_port(struct cdev *cdev) -{ - struct ib_umad_port *port; - - spin_lock(&ports_list_lock); - list_for_each_entry(port, &ports_list, port_lst) { - if (port->cdev == cdev || port->sm_cdev == cdev) { - kref_get(&port->umad_dev->ref); - spin_unlock(&ports_list_lock); - - return port; - } - } - spin_unlock(&ports_list_lock); - - return NULL; -} - -static void insert_port(struct ib_umad_port *port) -{ - spin_lock(&ports_list_lock); - list_add(&port->port_lst, &ports_list); - spin_unlock(&ports_list_lock); -} - static int hdr_size(struct ib_umad_file *file) { return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : sizeof (struct ib_user_mad_hdr_old); } /* caller must hold file->mutex */ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) { return file->agents_dead ? 
NULL : file->agent[id]; } static int queue_packet(struct ib_umad_file *file, struct ib_mad_agent *agent, struct ib_umad_packet *packet) { int ret = 1; mutex_lock(&file->mutex); for (packet->mad.hdr.id = 0; packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { list_add_tail(&packet->list, &file->recv_list); - selwakeup(&file->filp->f_selinfo); wake_up_interruptible(&file->recv_wait); + linux_poll_wakeup(file->filp); ret = 0; break; } mutex_unlock(&file->mutex); return ret; } static void dequeue_send(struct ib_umad_file *file, struct ib_umad_packet *packet) { spin_lock_irq(&file->send_lock); list_del(&packet->list); spin_unlock_irq(&file->send_lock); } static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *send_wc) { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet = send_wc->send_buf->context[0]; dequeue_send(file, packet); ib_destroy_ah(packet->msg->ah); ib_free_send_mad(packet->msg); if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { packet->length = IB_MGMT_MAD_HDR; packet->mad.hdr.status = ETIMEDOUT; if (!queue_packet(file, agent, packet)) return; } kfree(packet); } static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet; if (mad_recv_wc->wc->status != IB_WC_SUCCESS) goto err1; packet = kzalloc(sizeof *packet, GFP_KERNEL); if (!packet) goto err1; packet->length = mad_recv_wc->mad_len; packet->recv_wc = mad_recv_wc; packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; packet->mad.hdr.grh_present = 
!!(mad_recv_wc->wc->wc_flags & IB_WC_GRH); if (packet->mad.hdr.grh_present) { struct ib_ah_attr ah_attr; ib_init_ah_from_wc(agent->device, agent->port_num, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, &ah_attr); packet->mad.hdr.gid_index = ah_attr.grh.sgid_index; packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit; packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class; memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16); packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label); } if (queue_packet(file, agent, packet)) goto err2; return; err2: kfree(packet); err1: ib_free_recv_mad(mad_recv_wc); } static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, struct ib_umad_packet *packet, size_t count) { struct ib_mad_recv_buf *recv_buf; int left, seg_payload, offset, max_seg_payload; + size_t seg_size; - /* We need enough room to copy the first (or only) MAD segment. */ recv_buf = &packet->recv_wc->recv_buf; - if ((packet->length <= sizeof (*recv_buf->mad) && + seg_size = packet->recv_wc->mad_seg_size; + + /* We need enough room to copy the first (or only) MAD segment. */ + if ((packet->length <= seg_size && count < hdr_size(file) + packet->length) || - (packet->length > sizeof (*recv_buf->mad) && - count < hdr_size(file) + sizeof (*recv_buf->mad))) + (packet->length > seg_size && + count < hdr_size(file) + seg_size)) return -EINVAL; if (copy_to_user(buf, &packet->mad, hdr_size(file))) return -EFAULT; buf += hdr_size(file); - seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad)); + seg_payload = min_t(int, packet->length, seg_size); if (copy_to_user(buf, recv_buf->mad, seg_payload)) return -EFAULT; if (seg_payload < packet->length) { /* * Multipacket RMPP MAD message. Copy remainder of message. * Note that last segment may have a shorter payload. */ if (count < hdr_size(file) + packet->length) { /* * The buffer is too small, return the first RMPP segment, * which includes the RMPP message length. 
*/ return -ENOSPC; } offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class); - max_seg_payload = sizeof (struct ib_mad) - offset; + max_seg_payload = seg_size - offset; for (left = packet->length - seg_payload, buf += seg_payload; left; left -= seg_payload, buf += seg_payload) { recv_buf = container_of(recv_buf->list.next, struct ib_mad_recv_buf, list); seg_payload = min(left, max_seg_payload); - if (copy_to_user(buf, ((void *) recv_buf->mad) + offset, + if (copy_to_user(buf, (char *)recv_buf->mad + offset, seg_payload)) return -EFAULT; } } return hdr_size(file) + packet->length; } static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, struct ib_umad_packet *packet, size_t count) { ssize_t size = hdr_size(file) + packet->length; if (count < size) return -EINVAL; if (copy_to_user(buf, &packet->mad, hdr_size(file))) return -EFAULT; buf += hdr_size(file); if (copy_to_user(buf, packet->mad.data, packet->length)) return -EFAULT; return size; } static ssize_t ib_umad_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { struct ib_umad_file *file = filp->private_data; struct ib_umad_packet *packet; ssize_t ret; if (count < hdr_size(file)) return -EINVAL; mutex_lock(&file->mutex); while (list_empty(&file->recv_list)) { mutex_unlock(&file->mutex); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->recv_wait, !list_empty(&file->recv_list))) return -ERESTARTSYS; mutex_lock(&file->mutex); } packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); list_del(&packet->list); mutex_unlock(&file->mutex); if (packet->recv_wc) ret = copy_recv_mad(file, buf, packet, count); else ret = copy_send_mad(file, buf, packet, count); if (ret < 0) { /* Requeue packet */ mutex_lock(&file->mutex); list_add(&packet->list, &file->recv_list); mutex_unlock(&file->mutex); } else { if (packet->recv_wc) ib_free_recv_mad(packet->recv_wc); kfree(packet); } return ret; } static int copy_rmpp_mad(struct 
ib_mad_send_buf *msg, const char __user *buf) { int left, seg; /* Copy class specific header */ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) && - copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, + copy_from_user((char *)msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, msg->hdr_len - IB_MGMT_RMPP_HDR)) return -EFAULT; /* All headers are in place. Copy data segments. */ for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0; seg++, left -= msg->seg_size, buf += msg->seg_size) { if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf, min(left, msg->seg_size))) return -EFAULT; } return 0; } static int same_destination(struct ib_user_mad_hdr *hdr1, struct ib_user_mad_hdr *hdr2) { if (!hdr1->grh_present && !hdr2->grh_present) return (hdr1->lid == hdr2->lid); if (hdr1->grh_present && hdr2->grh_present) return !memcmp(hdr1->gid, hdr2->gid, 16); return 0; } static int is_duplicate(struct ib_umad_file *file, struct ib_umad_packet *packet) { struct ib_umad_packet *sent_packet; struct ib_mad_hdr *sent_hdr, *hdr; hdr = (struct ib_mad_hdr *) packet->mad.data; list_for_each_entry(sent_packet, &file->send_list, list) { sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data; if ((hdr->tid != sent_hdr->tid) || (hdr->mgmt_class != sent_hdr->mgmt_class)) continue; /* * No need to be overly clever here. If two new operations have * the same TID, reject the second as a duplicate. This is more * restrictive than required by the spec. 
*/ - if (!ib_response_mad((struct ib_mad *) hdr)) { - if (!ib_response_mad((struct ib_mad *) sent_hdr)) + if (!ib_response_mad(hdr)) { + if (!ib_response_mad(sent_hdr)) return 1; continue; - } else if (!ib_response_mad((struct ib_mad *) sent_hdr)) + } else if (!ib_response_mad(sent_hdr)) continue; if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) return 1; } return 0; } static ssize_t ib_umad_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_umad_file *file = filp->private_data; struct ib_umad_packet *packet; struct ib_mad_agent *agent; struct ib_ah_attr ah_attr; struct ib_ah *ah; struct ib_rmpp_mad *rmpp_mad; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; + u8 base_version; if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) return -EINVAL; packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); if (!packet) return -ENOMEM; if (copy_from_user(&packet->mad, buf, hdr_size(file))) { ret = -EFAULT; goto err; } if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { ret = -EINVAL; goto err; } buf += hdr_size(file); if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) { ret = -EFAULT; goto err; } mutex_lock(&file->mutex); agent = __get_agent(file, packet->mad.hdr.id); if (!agent) { ret = -EINVAL; goto err_up; } memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = be16_to_cpu(packet->mad.hdr.lid); ah_attr.sl = packet->mad.hdr.sl; ah_attr.src_path_bits = packet->mad.hdr.path_bits; ah_attr.port_num = file->port->port_num; if (packet->mad.hdr.grh_present) { ah_attr.ah_flags = IB_AH_GRH; memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16); ah_attr.grh.sgid_index = packet->mad.hdr.gid_index; - ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); - ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; + ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); + ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class; } ah = 
ib_create_ah(agent->qp->pd, &ah_attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto err_up; } rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); - if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) { - copy_offset = IB_MGMT_MAD_HDR; - rmpp_active = 0; - } else { + + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && ib_mad_kernel_rmpp_agent(agent)) { copy_offset = IB_MGMT_RMPP_HDR; rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & - IB_MGMT_RMPP_FLAG_ACTIVE; + IB_MGMT_RMPP_FLAG_ACTIVE; + } else { + copy_offset = IB_MGMT_MAD_HDR; + rmpp_active = 0; } + base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; data_len = count - hdr_size(file) - hdr_len; packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), packet->mad.hdr.pkey_index, rmpp_active, - hdr_len, data_len, GFP_KERNEL); + hdr_len, data_len, GFP_KERNEL, + base_version); if (IS_ERR(packet->msg)) { ret = PTR_ERR(packet->msg); goto err_ah; } - packet->msg->ah = ah; + packet->msg->ah = ah; packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; - packet->msg->retries = packet->mad.hdr.retries; + packet->msg->retries = packet->mad.hdr.retries; packet->msg->context[0] = packet; /* Copy MAD header. Any RMPP header is already in place. */ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); if (!rmpp_active) { - if (copy_from_user(packet->msg->mad + copy_offset, + if (copy_from_user((char *)packet->msg->mad + copy_offset, buf + copy_offset, hdr_len + data_len - copy_offset)) { ret = -EFAULT; goto err_msg; } } else { ret = copy_rmpp_mad(packet->msg, buf); if (ret) goto err_msg; } /* * Set the high-order part of the transaction ID to make MADs from * different agents unique, and allow routing responses back to the * original requestor. 
*/ if (!ib_response_mad(packet->msg->mad)) { tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | (be64_to_cpup(tid) & 0xffffffff)); rmpp_mad->mad_hdr.tid = *tid; } - spin_lock_irq(&file->send_lock); - ret = is_duplicate(file, packet); - if (!ret) + if (!ib_mad_kernel_rmpp_agent(agent) + && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + spin_lock_irq(&file->send_lock); list_add_tail(&packet->list, &file->send_list); - spin_unlock_irq(&file->send_lock); - if (ret) { - ret = -EINVAL; - goto err_msg; + spin_unlock_irq(&file->send_lock); + } else { + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; + } } ret = ib_post_send_mad(packet->msg, NULL); if (ret) goto err_send; mutex_unlock(&file->mutex); return count; err_send: dequeue_send(file, packet); err_msg: ib_free_send_mad(packet->msg); err_ah: ib_destroy_ah(ah); err_up: mutex_unlock(&file->mutex); err: kfree(packet); return ret; } static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait) { struct ib_umad_file *file = filp->private_data; /* we will always be able to post a MAD send */ unsigned int mask = POLLOUT | POLLWRNORM; poll_wait(filp, &file->recv_wait, wait); if (!list_empty(&file->recv_list)) mask |= POLLIN | POLLRDNORM; return mask; } static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, int compat_method_mask) { struct ib_user_mad_reg_req ureq; struct ib_mad_reg_req req; struct ib_mad_agent *agent = NULL; int agent_id; int ret; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid device\n"); ret = -EPIPE; goto out; } if (copy_from_user(&ureq, arg, sizeof ureq)) { ret = 
-EFAULT; goto out; } if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid QPN %d specified\n", + ureq.qpn); ret = -EINVAL; goto out; } for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) if (!__get_agent(file, agent_id)) goto found; + dev_notice(file->port->dev, + "ib_umad_reg_agent: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; found: if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; memcpy(req.oui, ureq.oui, sizeof req.oui); if (compat_method_mask) { u32 *umm = (u32 *) ureq.method_mask; int i; for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i) req.method_mask[i] = umm[i * 2] | ((u64) umm[i * 2 + 1] << 32); } else memcpy(req.method_mask, ureq.method_mask, sizeof req.method_mask); } agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, ureq.mgmt_class ? &req : NULL, ureq.rmpp_version, - send_handler, recv_handler, file); + send_handler, recv_handler, file, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); agent = NULL; goto out; } if (put_user(agent_id, - (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) { + (u32 __user *) ((char *)arg + offsetof(struct ib_user_mad_reg_req, id)))) { ret = -EFAULT; goto out; } if (!file->already_used) { file->already_used = 1; if (!file->use_pkey_index) { - printk(KERN_WARNING "user_mad: process %s did not enable " - "P_Key index support.\n", curthread->td_proc->p_comm); - printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " - "has info on the new ABI.\n"); + dev_warn(file->port->dev, + "process %s did not enable P_Key index support.\n", + current->comm); + dev_warn(file->port->dev, + " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); } } file->agent[agent_id] = agent; ret = 0; out: mutex_unlock(&file->mutex); if (ret && agent) 
ib_unregister_mad_agent(agent); mutex_unlock(&file->port->file_mutex); return ret; } +static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) +{ + struct ib_user_mad_reg_req2 ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { + const u32 flags = IB_USER_MAD_REG_FLAGS_CAP; + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", + ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + ret = -EINVAL; + + if (put_user(flags, + (u32 __user *) ((char *)arg + offsetof(struct + ib_user_mad_reg_req2, flags)))) + ret = -EFAULT; + + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent2: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + if (ureq.oui & 0xff000000) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + ureq.oui); + ret = -EINVAL; + goto out; + } + req.oui[2] = ureq.oui & 0x0000ff; + req.oui[1] = (ureq.oui & 0x00ff00) >> 8; + req.oui[0] = (ureq.oui & 0xff0000) >> 16; + memcpy(req.method_mask, ureq.method_mask, + sizeof(req.method_mask)); + } + + agent = ib_register_mad_agent(file->port->ib_dev, 
file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? &req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, + ureq.flags); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *)((char *)arg + + offsetof(struct ib_user_mad_reg_req2, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + file->use_pkey_index = 1; + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + + static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) { struct ib_mad_agent *agent = NULL; u32 id; int ret = 0; if (get_user(id, arg)) return -EFAULT; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { ret = -EINVAL; goto out; } agent = file->agent[id]; file->agent[id] = NULL; out: mutex_unlock(&file->mutex); if (agent) ib_unregister_mad_agent(agent); mutex_unlock(&file->port->file_mutex); return ret; } static long ib_umad_enable_pkey(struct ib_umad_file *file) { int ret = 0; mutex_lock(&file->mutex); if (file->already_used) ret = -EINVAL; else file->use_pkey_index = 1; mutex_unlock(&file->mutex); return ret; } static long ib_umad_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case IB_USER_MAD_REGISTER_AGENT: return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0); case IB_USER_MAD_UNREGISTER_AGENT: return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT static long ib_umad_compat_ioctl(struct file *filp, unsigned int 
cmd, unsigned long arg) { switch (cmd) { case IB_USER_MAD_REGISTER_AGENT: return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1); case IB_USER_MAD_UNREGISTER_AGENT: return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); default: return -ENOIOCTLCMD; } } #endif /* * ib_umad_open() does not need the BKL: * * - the ib_umad_port structures are properly reference counted, and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - the ioctl method does not affect any global state outside of the * file structure being operated on; */ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret; + int ret = -ENXIO; - port = get_port(inode->i_cdev->si_drv1); - if (!port) - return -ENXIO; + port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, cdev); mutex_lock(&port->file_mutex); - if (!port->ib_dev) { - release_port(port); - ret = -ENXIO; + if (!port->ib_dev) goto out; - } + ret = -ENOMEM; file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) { - release_port(port); - ret = -ENOMEM; + if (!file) goto out; - } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); INIT_LIST_HEAD(&file->recv_list); INIT_LIST_HEAD(&file->send_list); init_waitqueue_head(&file->recv_wait); file->port = port; file->filp = filp; filp->private_data = file; list_add_tail(&file->port_list, &port->file_list); ret = nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + kobject_get(&port->umad_dev->kobj); + out: mutex_unlock(&port->file_mutex); return ret; } static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; - struct ib_umad_port *port = 
file->port; + struct ib_umad_device *dev = file->port->umad_dev; struct ib_umad_packet *packet, *tmp; int already_dead; int i; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); already_dead = file->agents_dead; file->agents_dead = 1; list_for_each_entry_safe(packet, tmp, &file->recv_list, list) { if (packet->recv_wc) ib_free_recv_mad(packet->recv_wc); kfree(packet); } list_del(&file->port_list); mutex_unlock(&file->mutex); if (!already_dead) for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i) if (file->agent[i]) ib_unregister_mad_agent(file->agent[i]); mutex_unlock(&file->port->file_mutex); kfree(file); - release_port(port); + kobject_put(&dev->kobj); return 0; } static const struct file_operations umad_fops = { - .owner = THIS_MODULE, - .read = ib_umad_read, - .write = ib_umad_write, - .poll = ib_umad_poll, + .owner = THIS_MODULE, + .read = ib_umad_read, + .write = ib_umad_write, + .poll = ib_umad_poll, .unlocked_ioctl = ib_umad_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = ib_umad_compat_ioctl, + .compat_ioctl = ib_umad_compat_ioctl, #endif - .open = ib_umad_open, + .open = ib_umad_open, .release = ib_umad_close, .llseek = no_llseek, }; static int ib_umad_sm_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_port_modify props = { .set_port_cap_mask = IB_PORT_SM }; int ret; - port = get_port(inode->i_cdev->si_drv1); - if (!port) - return -ENXIO; + port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, sm_cdev); if (filp->f_flags & O_NONBLOCK) { if (down_trylock(&port->sm_sem)) { ret = -EAGAIN; goto fail; } } else { if (down_interruptible(&port->sm_sem)) { ret = -ERESTARTSYS; goto fail; } } ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); - if (ret) { - up(&port->sm_sem); - goto fail; - } + if (ret) + goto err_up_sem; filp->private_data = port; - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + kobject_get(&port->umad_dev->kobj); + + return 0; 
+ +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); + fail: - release_port(port); return ret; } static int ib_umad_sm_close(struct inode *inode, struct file *filp) { struct ib_umad_port *port = filp->private_data; struct ib_port_modify props = { .clr_port_cap_mask = IB_PORT_SM }; int ret = 0; mutex_lock(&port->file_mutex); if (port->ib_dev) ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); mutex_unlock(&port->file_mutex); up(&port->sm_sem); - release_port(port); + kobject_put(&port->umad_dev->kobj); return ret; } static const struct file_operations umad_sm_fops = { - .owner = THIS_MODULE, - .open = ib_umad_sm_open, + .owner = THIS_MODULE, + .open = ib_umad_sm_open, .release = ib_umad_sm_close, .llseek = no_llseek, }; static struct ib_client umad_client = { .name = "umad", .add = ib_umad_add_one, .remove = ib_umad_remove_one }; static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, char *buf) { struct ib_umad_port *port = dev_get_drvdata(dev); if (!port) return -ENODEV; return sprintf(buf, "%s\n", port->ib_dev->name); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static ssize_t show_port(struct device *dev, struct device_attribute *attr, char *buf) { struct ib_umad_port *port = dev_get_drvdata(dev); if (!port) return -ENODEV; return sprintf(buf, "%d\n", port->port_num); } static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); -static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION); -} -static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_MAD_ABI_VERSION)); static dev_t overflow_maj; -static int find_overflow_devnum(void) +static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); +static int find_overflow_devnum(struct ib_device 
*device) { int ret; if (!overflow_maj) { ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register dynamic device number\n"); + dev_err(&device->dev, + "couldn't register dynamic device number\n"); return ret; } } ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS); if (ret >= IB_UMAD_MAX_PORTS) return -1; return ret; } static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; dev_t base; spin_lock(&port_lock); devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); if (devnum >= IB_UMAD_MAX_PORTS) { spin_unlock(&port_lock); - devnum = find_overflow_devnum(); + devnum = find_overflow_devnum(device); if (devnum < 0) - return -1; + return -1; spin_lock(&port_lock); port->dev_num = devnum + IB_UMAD_MAX_PORTS; base = devnum + overflow_maj; set_bit(devnum, overflow_map); } else { port->dev_num = devnum; base = devnum + base_dev; set_bit(devnum, dev_map); } spin_unlock(&port_lock); port->ib_dev = device; port->port_num = port_num; sema_init(&port->sm_sem, 1); mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); - port->cdev = cdev_alloc(); - if (!port->cdev) - goto err_cdev_c; - - port->cdev->ops = &umad_fops; - port->cdev->owner = THIS_MODULE; - kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num); - if (cdev_add(port->cdev, base, 1)) + cdev_init(&port->cdev, &umad_fops); + port->cdev.owner = THIS_MODULE; + port->cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); + if (cdev_add(&port->cdev, base, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dma_device, - port->cdev->dev, port, + port->cdev.dev, port, "umad%d", port->dev_num); if (IS_ERR(port->dev)) goto err_cdev; if (device_create_file(port->dev, &dev_attr_ibdev)) goto err_dev; if (device_create_file(port->dev, &dev_attr_port)) goto err_dev; base += 
IB_UMAD_MAX_PORTS; - port->sm_cdev = cdev_alloc(); - if (!port->sm_cdev) - goto err_dev; - - port->sm_cdev->ops = &umad_sm_fops; - port->sm_cdev->owner = THIS_MODULE; - kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num); - if (cdev_add(port->sm_cdev, base, 1)) + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; + port->sm_cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); + if (cdev_add(&port->sm_cdev, base, 1)) goto err_sm_cdev; port->sm_dev = device_create(umad_class, device->dma_device, - port->sm_cdev->dev, port, + port->sm_cdev.dev, port, "issm%d", port->dev_num); if (IS_ERR(port->sm_dev)) goto err_sm_cdev; if (device_create_file(port->sm_dev, &dev_attr_ibdev)) goto err_sm_dev; if (device_create_file(port->sm_dev, &dev_attr_port)) goto err_sm_dev; return 0; err_sm_dev: - device_destroy(umad_class, port->sm_cdev->dev); + device_destroy(umad_class, port->sm_cdev.dev); err_sm_cdev: - cdev_del(port->sm_cdev); + cdev_del(&port->sm_cdev); err_dev: - device_destroy(umad_class, port->cdev->dev); + device_destroy(umad_class, port->cdev.dev); err_cdev: - cdev_del(port->cdev); -err_cdev_c: + cdev_del(&port->cdev); if (port->dev_num < IB_UMAD_MAX_PORTS) clear_bit(devnum, dev_map); else clear_bit(devnum, overflow_map); return -1; } static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; int id; dev_set_drvdata(port->dev, NULL); dev_set_drvdata(port->sm_dev, NULL); - device_destroy(umad_class, port->cdev->dev); - device_destroy(umad_class, port->sm_cdev->dev); + device_destroy(umad_class, port->cdev.dev); + device_destroy(umad_class, port->sm_cdev.dev); + cdev_del(&port->cdev); + cdev_del(&port->sm_cdev); + mutex_lock(&port->file_mutex); port->ib_dev = NULL; list_for_each_entry(file, &port->file_list, port_list) { mutex_lock(&file->mutex); file->agents_dead = 1; mutex_unlock(&file->mutex); for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) if (file->agent[id]) 
ib_unregister_mad_agent(file->agent[id]); } mutex_unlock(&port->file_mutex); + + if (port->dev_num < IB_UMAD_MAX_PORTS) + clear_bit(port->dev_num, dev_map); + else + clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); } static void ib_umad_add_one(struct ib_device *device) { struct ib_umad_device *umad_dev; int s, e, i; + int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + s = rdma_start_port(device); + e = rdma_end_port(device); - if (device->node_type == RDMA_NODE_IB_SWITCH) - s = e = 0; - else { - s = 1; - e = device->phys_port_cnt; - } - umad_dev = kzalloc(sizeof *umad_dev + (e - s + 1) * sizeof (struct ib_umad_port), GFP_KERNEL); if (!umad_dev) return; - kref_init(&umad_dev->ref); + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); - umad_dev->start_port = s; - umad_dev->end_port = e; - - for (i = 0; i <= e - s; ++i) - insert_port(&umad_dev->port[i]); - for (i = s; i <= e; ++i) { + if (!rdma_cap_ib_mad(device, i)) + continue; + umad_dev->port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) - goto err; + if (ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) + goto err; + + count++; } + if (!count) + goto free; + ib_set_client_data(device, &umad_client, umad_dev); return; err: - while (--i >= s) - ib_umad_kill_port(&umad_dev->port[i - s]); + while (--i >= s) { + if (!rdma_cap_ib_mad(device, i)) + continue; - put_umad_dev(&umad_dev->ref); + ib_umad_kill_port(&umad_dev->port[i - s]); + } +free: + kobject_put(&umad_dev->kobj); } -static void ib_umad_remove_one(struct ib_device *device) +static void ib_umad_remove_one(struct ib_device *device, void *client_data) { - struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client); + struct ib_umad_device *umad_dev = client_data; int i; if (!umad_dev) return; - for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) + for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { 
+ if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) ib_umad_kill_port(&umad_dev->port[i]); + } - put_umad_dev(&umad_dev->ref); + kobject_put(&umad_dev->kobj); } static char *umad_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } static int __init ib_umad_init(void) { int ret; - INIT_LIST_HEAD(&ports_list); - ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register device number\n"); + pr_err("couldn't register device number\n"); goto out; } umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); - printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n"); + pr_err("couldn't create class infiniband_mad\n"); goto out_chrdev; } umad_class->devnode = umad_devnode; - ret = class_create_file(umad_class, &class_attr_abi_version); + ret = class_create_file(umad_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n"); + pr_err("couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&umad_client); if (ret) { - printk(KERN_ERR "user_mad: couldn't register ib_umad client\n"); + pr_err("couldn't register ib_umad client\n"); goto out_class; } return 0; out_class: class_destroy(umad_class); out_chrdev: unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); out: return ret; } static void __exit ib_umad_cleanup(void) { ib_unregister_client(&umad_client); class_destroy(umad_class); unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); if (overflow_maj) unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); } -module_init(ib_umad_init); +module_init_order(ib_umad_init, SI_ORDER_THIRD); module_exit(ib_umad_cleanup); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs.h =================================================================== --- 
projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs.h (revision 319974) @@ -1,269 +1,297 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #ifndef UVERBS_H #define UVERBS_H #include #include #include #include #include +#include +#include #include #include -#include #include #include -#include +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (const void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + /* * Our lifetime rules for these structs are the following: * * struct ib_uverbs_device: One reference is held by the module and * released in ib_uverbs_remove_one(). Another reference is taken by * ib_uverbs_open() each time the character special file is opened, * and released in ib_uverbs_release_file() when the file is released. * * struct ib_uverbs_file: One reference is held by the VFS and * released when the file is closed. Another reference is taken when * an asynchronous event queue file is created and released when the * event file is closed. * * struct ib_uverbs_event_file: One reference is held by the VFS and * released when the file is closed. For asynchronous event files, * another reference is held by the corresponding main context file * and released when that file is closed. For completion event files, * a reference is taken when a CQ is created that uses the file, and * released when the CQ is destroyed. 
*/ struct ib_uverbs_device { - struct kref ref; + atomic_t refcount; int num_comp_vectors; struct completion comp; struct device *dev; - struct ib_device *ib_dev; + struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; + struct kobject kobj; + struct srcu_struct disassociate_srcu; + struct mutex lists_mutex; /* protect lists */ + struct list_head uverbs_file_list; + struct list_head uverbs_events_file_list; }; struct ib_uverbs_event_file { struct kref ref; struct file *filp; int is_async; struct ib_uverbs_file *uverbs_file; spinlock_t lock; int is_closed; wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; + struct list_head list; }; struct ib_uverbs_file { struct kref ref; struct mutex mutex; + struct mutex cleanup_mutex; /* protect cleanup */ struct ib_uverbs_device *device; struct ib_ucontext *ucontext; struct ib_event_handler event_handler; struct ib_uverbs_event_file *async_file; + struct list_head list; + int is_closed; }; struct ib_uverbs_event { union { struct ib_uverbs_async_event_desc async; struct ib_uverbs_comp_event_desc comp; } desc; struct list_head list; struct list_head obj_list; u32 *counter; }; struct ib_uverbs_mcast_entry { struct list_head list; union ib_gid gid; u16 lid; }; struct ib_uevent_object { struct ib_uobject uobject; struct list_head event_list; u32 events_reported; }; struct ib_uxrcd_object { struct ib_uobject uobject; atomic_t refcnt; }; struct ib_usrq_object { struct ib_uevent_object uevent; struct ib_uxrcd_object *uxrcd; }; struct ib_uqp_object { struct ib_uevent_object uevent; struct list_head mcast_list; struct ib_uxrcd_object *uxrcd; }; +struct ib_uwq_object { + struct ib_uevent_object uevent; +}; + struct ib_ucq_object { struct ib_uobject uobject; struct ib_uverbs_file *uverbs_file; struct list_head comp_list; struct list_head async_list; u32 comp_events_reported; u32 async_events_reported; }; -struct ib_udct_object { - 
struct ib_uobject uobject; -}; - extern spinlock_t ib_uverbs_idr_lock; extern struct idr ib_uverbs_pd_idr; extern struct idr ib_uverbs_mr_idr; extern struct idr ib_uverbs_mw_idr; extern struct idr ib_uverbs_ah_idr; extern struct idr ib_uverbs_cq_idr; extern struct idr ib_uverbs_qp_idr; extern struct idr ib_uverbs_srq_idr; extern struct idr ib_uverbs_xrcd_idr; extern struct idr ib_uverbs_rule_idr; -extern struct idr ib_uverbs_dct_idr; +extern struct idr ib_uverbs_wq_idr; +extern struct idr ib_uverbs_rwq_ind_tbl_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev, int is_async); +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file); struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); void ib_uverbs_release_ucq(struct ib_uverbs_file *file, struct ib_uverbs_event_file *ev_file, struct ib_ucq_object *uobj); void ib_uverbs_release_uevent(struct ib_uverbs_file *file, struct ib_uevent_object *uobj); void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +int uverbs_dealloc_mw(struct ib_mw *mw); + struct ib_uverbs_flow_spec { union { union { struct ib_uverbs_flow_spec_hdr hdr; struct { __u32 type; __u16 size; __u16 reserved; }; }; struct ib_uverbs_flow_spec_eth eth; - struct ib_uverbs_flow_spec_ib ib; struct ib_uverbs_flow_spec_ipv4 ipv4; struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_ipv6 ipv6; }; }; #define 
IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ const char __user *buf, int in_len, \ int out_len) IB_UVERBS_DECLARE_CMD(get_context); IB_UVERBS_DECLARE_CMD(query_device); IB_UVERBS_DECLARE_CMD(query_port); IB_UVERBS_DECLARE_CMD(alloc_pd); IB_UVERBS_DECLARE_CMD(dealloc_pd); IB_UVERBS_DECLARE_CMD(reg_mr); +IB_UVERBS_DECLARE_CMD(rereg_mr); IB_UVERBS_DECLARE_CMD(dereg_mr); IB_UVERBS_DECLARE_CMD(alloc_mw); IB_UVERBS_DECLARE_CMD(dealloc_mw); IB_UVERBS_DECLARE_CMD(create_comp_channel); IB_UVERBS_DECLARE_CMD(create_cq); IB_UVERBS_DECLARE_CMD(resize_cq); IB_UVERBS_DECLARE_CMD(poll_cq); IB_UVERBS_DECLARE_CMD(req_notify_cq); IB_UVERBS_DECLARE_CMD(destroy_cq); IB_UVERBS_DECLARE_CMD(create_qp); IB_UVERBS_DECLARE_CMD(open_qp); IB_UVERBS_DECLARE_CMD(query_qp); IB_UVERBS_DECLARE_CMD(modify_qp); IB_UVERBS_DECLARE_CMD(destroy_qp); IB_UVERBS_DECLARE_CMD(post_send); IB_UVERBS_DECLARE_CMD(post_recv); IB_UVERBS_DECLARE_CMD(post_srq_recv); IB_UVERBS_DECLARE_CMD(create_ah); IB_UVERBS_DECLARE_CMD(destroy_ah); IB_UVERBS_DECLARE_CMD(attach_mcast); IB_UVERBS_DECLARE_CMD(detach_mcast); IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); #define IB_UVERBS_DECLARE_EX_CMD(name) \ - int ib_uverbs_ex_##name(struct ib_uverbs_file *file,\ - struct ib_udata *ucore, \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ + struct ib_udata *ucore, \ struct ib_udata *uhw) -#define IB_UVERBS_DECLARE_EXP_CMD(name) \ - ssize_t ib_uverbs_exp_##name(struct ib_uverbs_file *file, \ - struct ib_udata *ucore, \ - struct ib_udata *uhw) - IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); - -IB_UVERBS_DECLARE_EXP_CMD(create_qp); -IB_UVERBS_DECLARE_EXP_CMD(modify_cq); 
-IB_UVERBS_DECLARE_EXP_CMD(modify_qp); -IB_UVERBS_DECLARE_EXP_CMD(create_cq); -IB_UVERBS_DECLARE_EXP_CMD(query_device); -IB_UVERBS_DECLARE_EXP_CMD(create_dct); -IB_UVERBS_DECLARE_EXP_CMD(destroy_dct); -IB_UVERBS_DECLARE_EXP_CMD(query_dct); +IB_UVERBS_DECLARE_EX_CMD(query_device); +IB_UVERBS_DECLARE_EX_CMD(create_cq); +IB_UVERBS_DECLARE_EX_CMD(create_qp); +IB_UVERBS_DECLARE_EX_CMD(create_wq); +IB_UVERBS_DECLARE_EX_CMD(modify_wq); +IB_UVERBS_DECLARE_EX_CMD(destroy_wq); +IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table); +IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table); #endif /* UVERBS_H */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_cmd.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_cmd.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_cmd.c (revision 319974) @@ -1,3914 +1,4251 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * Copyright (c) 2006 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define LINUXKPI_PARAM_PREFIX ibcore_ #include #include #include -#include +#include #include -#include -#include #include -#include -#include #include "uverbs.h" +#include "core_priv.h" -static int disable_raw_qp_enforcement; -module_param_named(disable_raw_qp_enforcement, disable_raw_qp_enforcement, int, - 0444); -MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable RAW QP enforcement for " - "being opened by root (default: 0)"); +#include struct uverbs_lock_class { - struct lock_class_key key; char name[16]; }; static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" }; static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" }; static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" }; static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" }; static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; -static struct uverbs_lock_class dct_lock_class = { .name = "DCT-uobj" }; +static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; +static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" }; +static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" }; -static int uverbs_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) -{ - return 
copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; -} - -static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len) -{ - return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; -} - -static struct ib_udata_ops uverbs_copy = { - .copy_from = uverbs_copy_from_udata, - .copy_to = uverbs_copy_to_udata -}; - -#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->ops = &uverbs_copy; \ - (udata)->inbuf = (void __user *) (ibuf); \ - (udata)->outbuf = (void __user *) (obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) - -enum uverbs_cmd_type { - IB_USER_VERBS_CMD_BASIC, - IB_USER_VERBS_CMD_EXTENDED -}; - /* * The ib_uobject locking scheme is as follows: * * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it - * needs to be held during all idr operations. When an object is + * needs to be held during all idr write operations. When an object is * looked up, a reference must be taken on the object's kref before - * dropping this lock. + * dropping this lock. For read operations, rcu_read_lock() is used + * instead of the spinlock, but similarly the kref reference must be + * grabbed before the rcu_read_unlock(). * * - Each object also has an rwsem. This rwsem must be held for * reading while an operation that uses the object is performed. * For example, while registering an MR, the associated PD's * uobject.mutex must be held for reading. The rwsem must be held * for writing while initializing or destroying an object. * * - In addition, each object has a "live" flag. If this flag is not * set, then lookups of the object will fail even if it is found in * the idr. This handles a reader that blocks and does not acquire * the rwsem until after the object is destroyed. The destroy * operation will set the live flag to 0 and then drop the rwsem; * this will allow the reader to acquire the rwsem, see that the * live flag is 0, and then drop the rwsem and its reference to * object.
The underlying storage will not be freed until the last * reference to the object is dropped. */ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, struct ib_ucontext *context, struct uverbs_lock_class *c) { uobj->user_handle = user_handle; uobj->context = context; kref_init(&uobj->ref); init_rwsem(&uobj->mutex); - lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name); uobj->live = 0; } static void release_uobj(struct kref *kref) { - kfree(container_of(kref, struct ib_uobject, ref)); + kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu); } static void put_uobj(struct ib_uobject *uobj) { kref_put(&uobj->ref, release_uobj); } static void put_uobj_read(struct ib_uobject *uobj) { up_read(&uobj->mutex); put_uobj(uobj); } static void put_uobj_write(struct ib_uobject *uobj) { up_write(&uobj->mutex); put_uobj(uobj); } static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj) { int ret; -retry: - if (!idr_pre_get(idr, GFP_KERNEL)) - return -ENOMEM; - + idr_preload(GFP_KERNEL); spin_lock(&ib_uverbs_idr_lock); - ret = idr_get_new(idr, uobj, &uobj->id); - spin_unlock(&ib_uverbs_idr_lock); - if (ret == -EAGAIN) - goto retry; + ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT); + if (ret >= 0) + uobj->id = ret; - return ret; + spin_unlock(&ib_uverbs_idr_lock); + idr_preload_end(); + + return ret < 0 ? 
ret : 0; } void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj) { spin_lock(&ib_uverbs_idr_lock); idr_remove(idr, uobj->id); spin_unlock(&ib_uverbs_idr_lock); } static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, struct ib_ucontext *context) { struct ib_uobject *uobj; - spin_lock(&ib_uverbs_idr_lock); + rcu_read_lock(); uobj = idr_find(idr, id); if (uobj) { if (uobj->context == context) kref_get(&uobj->ref); else uobj = NULL; } - spin_unlock(&ib_uverbs_idr_lock); + rcu_read_unlock(); return uobj; } static struct ib_uobject *idr_read_uobj(struct idr *idr, int id, struct ib_ucontext *context, int nested) { struct ib_uobject *uobj; uobj = __idr_get_uobj(idr, id, context); if (!uobj) return NULL; if (nested) down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING); else down_read(&uobj->mutex); if (!uobj->live) { put_uobj_read(uobj); return NULL; } return uobj; } static struct ib_uobject *idr_write_uobj(struct idr *idr, int id, struct ib_ucontext *context) { struct ib_uobject *uobj; uobj = __idr_get_uobj(idr, id, context); if (!uobj) return NULL; down_write(&uobj->mutex); if (!uobj->live) { put_uobj_write(uobj); return NULL; } return uobj; } static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context, int nested) { struct ib_uobject *uobj; uobj = idr_read_uobj(idr, id, context, nested); return uobj ? 
uobj->object : NULL; } static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0); } static void put_pd_read(struct ib_pd *pd) { put_uobj_read(pd->uobject); } static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested) { return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested); } static void put_cq_read(struct ib_cq *cq) { put_uobj_read(cq->uobject); } static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0); } static void put_ah_read(struct ib_ah *ah) { put_uobj_read(ah->uobject); } static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); } +static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0); +} + +static void put_wq_read(struct ib_wq *wq) +{ + put_uobj_read(wq->uobject); +} + +static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle, + struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0); +} + +static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table) +{ + put_uobj_read(ind_table->uobject); +} + static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) { struct ib_uobject *uobj; uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context); return uobj ? 
uobj->object : NULL; } static void put_qp_read(struct ib_qp *qp) { put_uobj_read(qp->uobject); } static void put_qp_write(struct ib_qp *qp) { put_uobj_write(qp->uobject); } -static struct ib_dct *idr_read_dct(int dct_handle, struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_dct_idr, dct_handle, context, 0); -} - -static void put_dct_read(struct ib_dct *dct) -{ - put_uobj_read(dct->uobject); -} - static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0); } static void put_srq_read(struct ib_srq *srq) { put_uobj_read(srq->uobject); } static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context, struct ib_uobject **uobj) { *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0); return *uobj ? (*uobj)->object : NULL; } static void put_xrcd_read(struct ib_uobject *uobj) { put_uobj_read(uobj); } ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; - struct ib_device *ibdev = file->device->ib_dev; struct ib_ucontext *ucontext; struct file *filp; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; mutex_lock(&file->mutex); if (file->ucontext) { ret = -EINVAL; goto err; } INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - ucontext = ibdev->alloc_ucontext(ibdev, &udata); + ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); if (IS_ERR(ucontext)) { ret = PTR_ERR(ucontext); goto err; } - ucontext->device = ibdev; + ucontext->device = ib_dev; INIT_LIST_HEAD(&ucontext->pd_list); INIT_LIST_HEAD(&ucontext->mr_list); INIT_LIST_HEAD(&ucontext->mw_list); INIT_LIST_HEAD(&ucontext->cq_list); INIT_LIST_HEAD(&ucontext->qp_list); 
INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); + INIT_LIST_HEAD(&ucontext->wq_list); + INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); - INIT_LIST_HEAD(&ucontext->dct_list); + rcu_read_lock(); + ucontext->tgid = get_pid(task_pid_group_leader(current)); + rcu_read_unlock(); ucontext->closing = 0; - ucontext->peer_mem_private_data = NULL; - ucontext->peer_mem_name = NULL; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + ucontext->umem_tree = RB_ROOT; + init_rwsem(&ucontext->umem_rwsem); + ucontext->odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->no_private_counters); + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; + +#endif + resp.num_comp_vectors = file->device->num_comp_vectors; - ret = get_unused_fd(); + ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; resp.async_fd = ret; - filp = ib_uverbs_alloc_event_file(file, 1); + filp = ib_uverbs_alloc_event_file(file, ib_dev, 1); if (IS_ERR(filp)) { ret = PTR_ERR(filp); goto err_fd; } - file->async_file = filp->private_data; - - INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev, - ib_uverbs_event_handler); - ret = ib_register_event_handler(&file->event_handler); - if (ret) - goto err_file; - if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_file; } - kref_get(&file->async_file->ref); - kref_get(&file->ref); + file->ucontext = ucontext; fd_install(resp.async_fd, filp); mutex_unlock(&file->mutex); return in_len; err_file: + ib_uverbs_free_async_event_file(file); fput(filp); err_fd: put_unused_fd(resp.async_fd); err_free: - ibdev->dealloc_ucontext(ucontext); + put_pid(ucontext->tgid); + ib_dev->dealloc_ucontext(ucontext); err: mutex_unlock(&file->mutex); return ret; } -static void ib_uverbs_query_device_assign( - struct ib_uverbs_query_device_resp *resp, - struct ib_device_attr 
*attr, - struct ib_uverbs_file *file) +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) { - memset(resp, 0, sizeof(*resp)); - - resp->fw_ver = attr->fw_ver; - resp->node_guid = file->device->ib_dev->node_guid; - resp->sys_image_guid = attr->sys_image_guid; - resp->max_mr_size = attr->max_mr_size; - resp->page_size_cap = attr->page_size_cap; - resp->vendor_id = attr->vendor_id; - resp->vendor_part_id = attr->vendor_part_id; - resp->hw_ver = attr->hw_ver; - resp->max_qp = attr->max_qp; - resp->max_qp_wr = attr->max_qp_wr; - resp->device_cap_flags = attr->device_cap_flags; - resp->max_sge = attr->max_sge; - resp->max_sge_rd = attr->max_sge_rd; - resp->max_cq = attr->max_cq; - resp->max_cqe = attr->max_cqe; - resp->max_mr = attr->max_mr; - resp->max_pd = attr->max_pd; - resp->max_qp_rd_atom = attr->max_qp_rd_atom; - resp->max_ee_rd_atom = attr->max_ee_rd_atom; - resp->max_res_rd_atom = attr->max_res_rd_atom; - resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; - resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; - resp->atomic_cap = attr->atomic_cap; - resp->max_ee = attr->max_ee; - resp->max_rdd = attr->max_rdd; - resp->max_mw = attr->max_mw; - resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; - resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; - resp->max_mcast_grp = attr->max_mcast_grp; - resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; - resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; - resp->max_ah = attr->max_ah; - resp->max_fmr = attr->max_fmr; - resp->max_map_per_fmr = attr->max_map_per_fmr; - resp->max_srq = attr->max_srq; - resp->max_srq_wr = attr->max_srq_wr; - resp->max_srq_sge = attr->max_srq_sge; - resp->max_pkeys = attr->max_pkeys; - resp->local_ca_ack_delay = attr->local_ca_ack_delay; - resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; + resp->fw_ver = attr->fw_ver; + resp->node_guid = 
ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = (u32)(attr->device_cap_flags); + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = ib_dev->phys_port_cnt; } ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; - struct ib_device_attr attr; - int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = 
ib_query_device(file->device->ib_dev, &attr); - if (ret) - return ret; + memset(&resp, 0, sizeof resp); + copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); - ib_uverbs_query_device_assign(&resp, &attr, file); - - if (copy_to_user((void __user *)(unsigned long) cmd.response, - &resp, sizeof(resp))) + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_port cmd; struct ib_uverbs_query_port_resp resp; struct ib_port_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr); + ret = ib_query_port(ib_dev, cmd.port_num, &attr); if (ret) return ret; memset(&resp, 0, sizeof resp); resp.state = attr.state; resp.max_mtu = attr.max_mtu; resp.active_mtu = attr.active_mtu; resp.gid_tbl_len = attr.gid_tbl_len; resp.port_cap_flags = attr.port_cap_flags; resp.max_msg_sz = attr.max_msg_sz; resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; resp.lid = attr.lid; resp.sm_lid = attr.sm_lid; resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; resp.subnet_timeout = attr.subnet_timeout; resp.init_type_reply = attr.init_type_reply; resp.active_width = attr.active_width; resp.active_speed = attr.active_speed; resp.phys_state = attr.phys_state; - resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, + resp.link_layer = rdma_port_get_link_layer(ib_dev, cmd.port_num); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct 
ib_uverbs_alloc_pd cmd; struct ib_uverbs_alloc_pd_resp resp; struct ib_udata udata; struct ib_uobject *uobj; struct ib_pd *pd; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, 0, file->ucontext, &pd_lock_class); down_write(&uobj->mutex); - pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, - file->ucontext, &udata); + pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); if (IS_ERR(pd)) { ret = PTR_ERR(pd); goto err; } - pd->device = file->device->ib_dev; + pd->device = ib_dev; pd->uobject = uobj; + pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); uobj->object = pd; ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj); if (ret) goto err_idr; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->pd_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_pd_idr, uobj); err_idr: ib_dealloc_pd(pd); err: put_uobj_write(uobj); return ret; } ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_pd cmd; struct ib_uobject *uobj; + struct ib_pd *pd; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); if (!uobj) return -EINVAL; + pd = uobj->object; - ret = ib_dealloc_pd(uobj->object); - if (!ret) - uobj->live = 0; + if (atomic_read(&pd->usecnt)) { + ret = -EBUSY; + goto err_put; + } - put_uobj_write(uobj); - + ret = pd->device->dealloc_pd(uobj->object); 
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); if (ret) - return ret; + goto err_put; + uobj->live = 0; + put_uobj_write(uobj); + idr_remove_uobj(&ib_uverbs_pd_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; + +err_put: + put_uobj_write(uobj); + return ret; } struct xrcd_table_entry { struct rb_node node; struct ib_xrcd *xrcd; struct inode *inode; }; static int xrcd_table_insert(struct ib_uverbs_device *dev, struct inode *inode, struct ib_xrcd *xrcd) { struct xrcd_table_entry *entry, *scan; struct rb_node **p = &dev->xrcd_tree.rb_node; struct rb_node *parent = NULL; entry = kmalloc(sizeof *entry, GFP_KERNEL); if (!entry) return -ENOMEM; entry->xrcd = xrcd; entry->inode = inode; while (*p) { parent = *p; scan = rb_entry(parent, struct xrcd_table_entry, node); if (inode < scan->inode) { p = &(*p)->rb_left; } else if (inode > scan->inode) { p = &(*p)->rb_right; } else { kfree(entry); return -EEXIST; } } rb_link_node(&entry->node, parent, p); rb_insert_color(&entry->node, &dev->xrcd_tree); igrab(inode); return 0; } static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev, struct inode *inode) { struct xrcd_table_entry *entry; struct rb_node *p = dev->xrcd_tree.rb_node; while (p) { entry = rb_entry(p, struct xrcd_table_entry, node); if (inode < entry->inode) p = p->rb_left; else if (inode > entry->inode) p = p->rb_right; else return entry; } return NULL; } static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode) { struct xrcd_table_entry *entry; entry = xrcd_table_search(dev, inode); if (!entry) return NULL; return entry->xrcd; } static void xrcd_table_delete(struct ib_uverbs_device *dev, struct inode *inode) { struct xrcd_table_entry *entry; entry = xrcd_table_search(dev, inode); if (entry) { iput(inode); rb_erase(&entry->node, &dev->xrcd_tree); kfree(entry); } } ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, - const char 
__user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_open_xrcd cmd; struct ib_uverbs_open_xrcd_resp resp; - struct ib_udata udata; + struct ib_udata udata; struct ib_uxrcd_object *obj; struct ib_xrcd *xrcd = NULL; struct fd f = {NULL}; struct inode *inode = NULL; int ret = 0; int new_xrcd = 0; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd, out_len - sizeof resp); mutex_lock(&file->device->xrcd_tree_mutex); if (cmd.fd != -1) { /* search for file descriptor */ f = fdget(cmd.fd); if (!f.file) { ret = -EBADF; goto err_tree_mutex_unlock; } inode = f.file->f_dentry->d_inode; xrcd = find_xrcd(file->device, inode); if (!xrcd && !(cmd.oflags & O_CREAT)) { /* no file descriptor. Need CREATE flag */ ret = -EAGAIN; goto err_tree_mutex_unlock; } if (xrcd && cmd.oflags & O_EXCL) { - ret = -EINVAL; + ret = -EINVAL; goto err_tree_mutex_unlock; } } obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) { ret = -ENOMEM; goto err_tree_mutex_unlock; } init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class); down_write(&obj->uobject.mutex); if (!xrcd) { - xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, - file->ucontext, &udata); + xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; } xrcd->inode = inode; - xrcd->device = file->device->ib_dev; + xrcd->device = ib_dev; atomic_set(&xrcd->usecnt, 0); mutex_init(&xrcd->tgt_qp_mutex); INIT_LIST_HEAD(&xrcd->tgt_qp_list); new_xrcd = 1; } atomic_set(&obj->refcnt, 0); obj->uobject.object = xrcd; ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); if (ret) goto err_idr; memset(&resp, 0, sizeof resp); resp.xrcd_handle = obj->uobject.id; if (inode) { if (new_xrcd) { /* create new 
inode/xrcd table entry */ ret = xrcd_table_insert(file->device, inode, xrcd); if (ret) goto err_insert_xrcd; } atomic_inc(&xrcd->usecnt); } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } if (f.file) fdput(f); mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list); mutex_unlock(&file->mutex); obj->uobject.live = 1; up_write(&obj->uobject.mutex); mutex_unlock(&file->device->xrcd_tree_mutex); return in_len; err_copy: if (inode) { if (new_xrcd) xrcd_table_delete(file->device, inode); atomic_dec(&xrcd->usecnt); } err_insert_xrcd: idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); err_idr: ib_dealloc_xrcd(xrcd); err: put_uobj_write(&obj->uobject); err_tree_mutex_unlock: if (f.file) fdput(f); mutex_unlock(&file->device->xrcd_tree_mutex); return ret; } ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_close_xrcd cmd; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_xrcd *xrcd = NULL; struct inode *inode = NULL; struct ib_uxrcd_object *obj; int live; int ret = 0; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; mutex_lock(&file->device->xrcd_tree_mutex); uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext); if (!uobj) { ret = -EINVAL; goto out; } xrcd = uobj->object; inode = xrcd->inode; obj = container_of(uobj, struct ib_uxrcd_object, uobject); if (atomic_read(&obj->refcnt)) { put_uobj_write(uobj); ret = -EBUSY; goto out; } if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { ret = ib_dealloc_xrcd(uobj->object); - if (!ret) - uobj->live = 0; + if (!ret) + uobj->live = 0; } live = uobj->live; if (inode && ret) atomic_inc(&xrcd->usecnt); put_uobj_write(uobj); if (ret) goto out; if (inode && !live) xrcd_table_delete(file->device, inode); 
idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); ret = in_len; out: mutex_unlock(&file->device->xrcd_tree_mutex); return ret; } void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd) { struct inode *inode; inode = xrcd->inode; if (inode && !atomic_dec_and_test(&xrcd->usecnt)) return; ib_dealloc_xrcd(xrcd); if (inode) xrcd_table_delete(dev, inode); } ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_reg_mr cmd; struct ib_uverbs_reg_mr_resp resp; - struct ib_udata udata; + struct ib_udata udata; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; - int ret; + int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; ret = ib_check_mr_access(cmd.access_flags); if (ret) return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, 0, file->ucontext, &mr_lock_class); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { - ret = -EINVAL; + ret = -EINVAL; goto err_free; } - /* We first get a new "obj id" to be passed later to reg mr for - further use as mr_id. 
- */ - ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); - if (ret) - goto err_put; + if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + ret = -EINVAL; + goto err_put; + } + } + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, &udata, uobj->id); + cmd.access_flags, &udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); - goto err_remove_uobj; + goto err_put; } mr->device = pd->device; mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); uobj->object = mr; + ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); + if (ret) + goto err_unreg; memset(&resp, 0, sizeof resp); resp.lkey = mr->lkey; resp.rkey = mr->rkey; resp.mr_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->mr_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: - ib_dereg_mr(mr); - -err_remove_uobj: idr_remove_uobj(&ib_uverbs_mr_idr, uobj); +err_unreg: + ib_dereg_mr(mr); + err_put: put_pd_read(pd); err_free: put_uobj_write(uobj); return ret; } +ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_udata udata; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if 
((cmd.flags & IB_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, + file->ucontext); + + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_PD) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + old_pd = mr->pd; + ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd, &udata); + if (!ret) { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + else + ret = in_len; + +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + put_pd_read(pd); + +put_uobjs: + + put_uobj_write(mr->uobject); + + return ret; +} + ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_dereg_mr cmd; struct ib_mr *mr; struct ib_uobject *uobj; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext); if (!uobj) return -EINVAL; mr = uobj->object; ret = ib_dereg_mr(mr); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_mr_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file 
*file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_alloc_mw cmd; struct ib_uverbs_alloc_mw_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; + struct ib_udata udata; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, 0, file->ucontext, &mw_lock_class); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_free; } - mw = pd->device->alloc_mw(pd, cmd.mw_type); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); goto err_put; } mw->device = pd->device; mw->pd = pd; mw->uobject = uobj; atomic_inc(&pd->usecnt); uobj->object = mw; ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj); if (ret) goto err_unalloc; memset(&resp, 0, sizeof(resp)); resp.rkey = mw->rkey; resp.mw_handle = uobj->id; if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) { ret = -EFAULT; goto err_copy; } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->mw_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_mw_idr, uobj); err_unalloc: - ib_dealloc_mw(mw); + uverbs_dealloc_mw(mw); err_put: put_pd_read(pd); err_free: put_uobj_write(uobj); return ret; } ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_mw cmd; struct ib_mw *mw; - struct ib_uobject *uobj; - int ret = -EINVAL; + struct ib_uobject *uobj; + int ret = -EINVAL; if 
(copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext); if (!uobj) return -EINVAL; mw = uobj->object; - ret = ib_dealloc_mw(mw); + ret = uverbs_dealloc_mw(mw); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_comp_channel cmd; struct ib_uverbs_create_comp_channel_resp resp; struct file *filp; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = get_unused_fd(); + ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) return ret; resp.fd = ret; - filp = ib_uverbs_alloc_event_file(file, 0); + filp = ib_uverbs_alloc_event_file(file, ib_dev, 0); if (IS_ERR(filp)) { put_unused_fd(resp.fd); return PTR_ERR(filp); } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { put_unused_fd(resp.fd); fput(filp); return -EFAULT; } fd_install(resp.fd, filp); return in_len; } -static ssize_t create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len, void *vcmd, int ex, - void __user *response) +static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_cq *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *udata, + void *context), + void *context) { - struct ib_uverbs_create_cq *cmd; - struct ib_uverbs_create_cq_ex *cmd_e; - struct ib_uverbs_create_cq_resp resp; - struct ib_udata 
udata; struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file = NULL; struct ib_cq *cq; - struct ib_cq_init_attr attr; - int cmd_sz; int ret; + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_cq_init_attr attr = {}; - if (out_len < sizeof resp) - return -ENOSPC; - - cmd = vcmd; - cmd_e = vcmd; - cmd_sz = ex ? sizeof(*cmd_e) : sizeof(*cmd); - INIT_UDATA(&udata, buf + cmd_sz, response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); - if (cmd->comp_vector >= file->device->num_comp_vectors) - return -EINVAL; + return ERR_PTR(-EINVAL); obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, - &cq_lock_class); + init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class); down_write(&obj->uobject.mutex); if (cmd->comp_channel >= 0) { ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel); if (!ev_file) { ret = -EINVAL; goto err; } } obj->uverbs_file = file; obj->comp_events_reported = 0; obj->async_events_reported = 0; INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - memset(&attr, 0, sizeof(attr)); attr.cqe = cmd->cqe; attr.comp_vector = cmd->comp_vector; - if (ex && (cmd_e->comp_mask & IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS)) - attr.flags = cmd_e->create_flags; - cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr, - file->ucontext, &udata); + + if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) + attr.flags = cmd->flags; + + cq = ib_dev->create_cq(ib_dev, &attr, + file->ucontext, uhw); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; } - cq->device = file->device->ib_dev; + cq->device = ib_dev; cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file; atomic_set(&cq->usecnt, 0); obj->uobject.object = cq; ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject); if (ret) goto err_free; memset(&resp, 
0, sizeof resp); - resp.cq_handle = obj->uobject.id; - resp.cqe = cq->cqe; + resp.base.cq_handle = obj->uobject.id; + resp.base.cqe = cq->cqe; - if (copy_to_user(response, &resp, sizeof(resp))) { - ret = -EFAULT; - goto err_copy; - } + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + ret = cb(file, obj, &resp, ucore, context); + if (ret) + goto err_cb; + mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->cq_list); mutex_unlock(&file->mutex); obj->uobject.live = 1; up_write(&obj->uobject.mutex); - return in_len; + return obj; -err_copy: +err_cb: idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject); err_free: ib_destroy_cq(cq); err_file: if (ev_file) ib_uverbs_release_ucq(file, ev_file, obj); err: put_uobj_write(&obj->uobject); - return ret; + + return ERR_PTR(ret); } +static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + return 0; +} + ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { - struct ib_uverbs_create_cq cmd; + struct ib_uverbs_create_cq cmd; + struct ib_uverbs_ex_create_cq cmd_ex; + struct ib_uverbs_create_cq_resp resp; + struct ib_udata ucore; + struct ib_udata uhw; + struct ib_ucq_object *obj; + if (out_len < sizeof(resp)) + return -ENOSPC; + if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - return create_cq(file, buf, in_len, out_len, &cmd, - IB_USER_VERBS_CMD_BASIC, - (void __user *) (unsigned long) cmd.response); + INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), sizeof(resp)); + + INIT_UDATA(&uhw, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - 
sizeof(resp)); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.cqe = cmd.cqe; + cmd_ex.comp_vector = cmd.comp_vector; + cmd_ex.comp_channel = cmd.comp_channel; + + obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), comp_channel) + + sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb, + NULL); + + if (IS_ERR(obj)) + return PTR_ERR(obj); + + return in_len; } +static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_uverbs_ex_create_cq cmd; + struct ib_ucq_object *obj; + int err; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + obj = create_cq(file, ib_dev, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_cq_cb, NULL); + + if (IS_ERR(obj)) + return PTR_ERR(obj); + + return 0; +} + ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_resize_cq cmd; struct ib_uverbs_resize_cq_resp resp; struct ib_udata udata; struct ib_cq *cq; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) 
return -EINVAL; ret = cq->device->resize_cq(cq, cmd.cqe, &udata); if (ret) goto out; resp.cqe = cq->cqe; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp.cqe)) ret = -EFAULT; out: put_cq_read(cq); return ret ? ret : in_len; } static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) { struct ib_uverbs_wc tmp; tmp.wr_id = wc->wr_id; tmp.status = wc->status; tmp.opcode = wc->opcode; tmp.vendor_err = wc->vendor_err; tmp.byte_len = wc->byte_len; tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; tmp.qp_num = wc->qp->qp_num; tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; tmp.slid = wc->slid; tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; tmp.reserved = 0; if (copy_to_user(dest, &tmp, sizeof tmp)) return -EFAULT; return 0; } ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_poll_cq cmd; struct ib_uverbs_poll_cq_resp resp; u8 __user *header_ptr; u8 __user *data_ptr; struct ib_cq *cq; struct ib_wc wc; - int ret; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) return -EINVAL; /* we copy a struct ib_uverbs_poll_cq_resp to user space */ header_ptr = (void __user *)(unsigned long) cmd.response; data_ptr = header_ptr + sizeof resp; memset(&resp, 0, sizeof resp); while (resp.count < cmd.ne) { ret = ib_poll_cq(cq, 1, &wc); if (ret < 0) goto out_put; if (!ret) break; ret = copy_wc_to_user(data_ptr, &wc); if (ret) goto out_put; data_ptr += sizeof(struct ib_uverbs_wc); ++resp.count; } if (copy_to_user(header_ptr, &resp, sizeof resp)) { ret = -EFAULT; goto out_put; } ret = in_len; out_put: put_cq_read(cq); return ret; } ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, 
int in_len, int out_len) { struct ib_uverbs_req_notify_cq cmd; struct ib_cq *cq; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) return -EINVAL; ib_req_notify_cq(cq, cmd.solicited_only ? IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); put_cq_read(cq); return in_len; } ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_cq cmd; struct ib_uverbs_destroy_cq_resp resp; struct ib_uobject *uobj; struct ib_cq *cq; struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext); if (!uobj) return -EINVAL; cq = uobj->object; ev_file = cq->cq_context; obj = container_of(cq->uobject, struct ib_ucq_object, uobject); ret = ib_destroy_cq(cq); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_cq_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); ib_uverbs_release_ucq(file, ev_file, obj); memset(&resp, 0, sizeof resp); resp.comp_events_reported = obj->comp_events_reported; resp.async_events_reported = obj->async_events_reported; put_uobj(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } -ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +static int create_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_qp *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *udata), + void *context) { - void __user *response; - struct ib_udata udata; - struct ib_uqp_object *obj; - struct ib_device *device; - struct ib_pd *pd = NULL; - 
struct ib_xrcd *xrcd = NULL; - struct ib_uobject *uninitialized_var(xrcd_uobj); - struct ib_cq *scq = NULL, *rcq = NULL; - struct ib_srq *srq = NULL; - struct ib_qp *qp; - struct ib_qp_init_attr attr; - int ret; - union { - struct ib_uverbs_create_qp basic; - } cmd_obj; - struct ib_uverbs_create_qp *cmd; - size_t cmd_size = 0; - union { - struct ib_uverbs_create_qp_resp basic; - } resp_obj; - struct ib_uverbs_create_qp_resp *resp; - size_t resp_size = 0; + struct ib_uqp_object *obj; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; + struct ib_qp *qp; + char *buf; + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_ex_create_qp_resp resp; + int ret; + struct ib_rwq_ind_table *ind_tbl = NULL; + bool has_sq = true; - cmd_size = sizeof(cmd_obj.basic); - cmd = &cmd_obj.basic; - - resp_size = sizeof(resp_obj.basic); - resp = &resp_obj.basic; - - if (out_len < resp_size) - return -ENOSPC; - - if (copy_from_user(&cmd_obj, buf, cmd_size)) - return -EFAULT; - - response = (void __user *) (unsigned long) cmd->response; - - if (!disable_raw_qp_enforcement && - cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW)) + if (cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW) != 0) return -EPERM; - INIT_UDATA(&udata, buf + cmd_size, response + resp_size, - in_len - cmd_size, out_len - resp_size); - obj = kzalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; - init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, + &qp_lock_class); down_write(&obj->uevent.uobject.mutex); + if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + + sizeof(cmd->rwq_ind_tbl_handle) && + (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { + ind_tbl = 
idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle, + file->ucontext); + if (!ind_tbl) { + ret = -EINVAL; + goto err_put; + } + attr.rwq_ind_tbl = ind_tbl; + } + + if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + + sizeof(cmd->reserved1)) && cmd->reserved1) { + ret = -EOPNOTSUPP; + goto err_put; + } + + if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) { + ret = -EINVAL; + goto err_put; + } + + if (ind_tbl && !cmd->max_send_wr) + has_sq = false; + if (cmd->qp_type == IB_QPT_XRC_TGT) { - xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, &xrcd_uobj); + xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, + &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } device = xrcd->device; } else { if (cmd->qp_type == IB_QPT_XRC_INI) { cmd->max_recv_wr = 0; cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { - srq = idr_read_srq(cmd->srq_handle, file->ucontext); + srq = idr_read_srq(cmd->srq_handle, + file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { - ret = -EINVAL; + ret = -EINVAL; goto err_put; - } - } + } + } - if (cmd->recv_cq_handle != cmd->send_cq_handle) { - rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0); - if (!rcq) { - ret = -EINVAL; - goto err_put; + if (!ind_tbl) { + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = idr_read_cq(cmd->recv_cq_handle, + file->ucontext, 0); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } } - } } - scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); - rcq = rcq ?: scq; + if (has_sq) + scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); + if (!ind_tbl) + rcq = rcq ?: scq; pd = idr_read_pd(cmd->pd_handle, file->ucontext); - if (!pd || !scq) { + if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; - } + } device = pd->device; - } + } - memset(&attr, 0, sizeof attr); attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; attr.xrcd = xrcd; - attr.sq_sig_type = 
cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; attr.qp_type = cmd->qp_type; attr.create_flags = 0; attr.cap.max_send_wr = cmd->max_send_wr; attr.cap.max_recv_wr = cmd->max_recv_wr; attr.cap.max_send_sge = cmd->max_send_sge; attr.cap.max_recv_sge = cmd->max_recv_sge; attr.cap.max_inline_data = cmd->max_inline_data; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); + if (cmd_sz >= offsetof(typeof(*cmd), create_flags) + + sizeof(cmd->create_flags)) + attr.create_flags = cmd->create_flags; + + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV | + IB_QP_CREATE_SCATTER_FCS)) { + ret = -EINVAL; + goto err_put; + } + + buf = (char *)cmd + sizeof(*cmd); + if (cmd_sz > sizeof(*cmd)) + if (!(buf[0] == 0 && !memcmp(buf, buf + 1, + cmd_sz - sizeof(*cmd) - 1))) { + ret = -EINVAL; + goto err_put; + } + if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else - qp = device->create_qp(pd, &attr, &udata); + qp = device->create_qp(pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } if (cmd->qp_type != IB_QPT_XRC_TGT) { qp->real_qp = qp; qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; + qp->rwq_ind_tbl = ind_tbl; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); + if (attr.send_cq) + atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); + if (ind_tbl) + atomic_inc(&ind_tbl->usecnt); } qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto 
err_destroy; - memset(&resp_obj, 0, sizeof(resp_obj)); - resp->qpn = qp->qp_num; - resp->qp_handle = obj->uevent.uobject.id; - resp->max_recv_sge = attr.cap.max_recv_sge; - resp->max_send_sge = attr.cap.max_send_sge; - resp->max_recv_wr = attr.cap.max_recv_wr; - resp->max_send_wr = attr.cap.max_send_wr; - resp->max_inline_data = attr.cap.max_inline_data; + memset(&resp, 0, sizeof resp); + resp.base.qpn = qp->qp_num; + resp.base.qp_handle = obj->uevent.uobject.id; + resp.base.max_recv_sge = attr.cap.max_recv_sge; + resp.base.max_send_sge = attr.cap.max_send_sge; + resp.base.max_recv_wr = attr.cap.max_recv_wr; + resp.base.max_send_wr = attr.cap.max_send_wr; + resp.base.max_inline_data = attr.cap.max_inline_data; - if (copy_to_user(response, &resp_obj, resp_size)) { - ret = -EFAULT; - goto err_copy; - } + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + ret = cb(file, &resp, ucore); + if (ret) + goto err_cb; + if (xrcd) { - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); - } + } if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); - return in_len; - -err_copy: + return 0; +err_cb: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); err_put: if (xrcd) put_xrcd_read(xrcd_uobj); if (pd) put_pd_read(pd); if (scq) put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); put_uobj_write(&obj->uevent.uobject); return ret; } +static int 
ib_uverbs_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + return 0; +} + +ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_qp cmd; + struct ib_uverbs_ex_create_qp cmd_ex; + struct ib_udata ucore; + struct ib_udata uhw; + ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp); + int err; + + if (out_len < resp_size) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), + resp_size); + INIT_UDATA(&uhw, buf + sizeof(cmd), + (unsigned long)cmd.response + resp_size, + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - resp_size); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.pd_handle = cmd.pd_handle; + cmd_ex.send_cq_handle = cmd.send_cq_handle; + cmd_ex.recv_cq_handle = cmd.recv_cq_handle; + cmd_ex.srq_handle = cmd.srq_handle; + cmd_ex.max_send_wr = cmd.max_send_wr; + cmd_ex.max_recv_wr = cmd.max_recv_wr; + cmd_ex.max_send_sge = cmd.max_send_sge; + cmd_ex.max_recv_sge = cmd.max_recv_sge; + cmd_ex.max_inline_data = cmd.max_inline_data; + cmd_ex.sq_sig_all = cmd.sq_sig_all; + cmd_ex.qp_type = cmd.qp_type; + cmd_ex.is_srq = cmd.is_srq; + + err = create_qp(file, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), is_srq) + + sizeof(cmd.is_srq), ib_uverbs_create_qp_cb, + NULL); + + if (err) + return err; + + return in_len; +} + +static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + 
struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_qp_resp resp; + struct ib_uverbs_ex_create_qp cmd = {0}; + int err; + + if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) + + sizeof(cmd.comp_mask))) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; + + if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + err = create_qp(file, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_qp_cb, NULL); + + if (err) + return err; + + return 0; +} + ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_open_qp cmd; struct ib_uverbs_create_qp_resp resp; struct ib_udata udata; struct ib_uqp_object *obj; struct ib_xrcd *xrcd; struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_qp *qp; struct ib_qp_open_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.qp_num = cmd.qpn; attr.qp_type = cmd.qp_type; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); qp = ib_open_qp(xrcd, &attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; - } + } qp->uobject = 
&obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp, 0, sizeof resp); resp.qpn = qp->qp_num; resp.qp_handle = obj->uevent.uobject.id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_remove; } obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; + up_write(&obj->uevent.uobject.mutex); return in_len; err_remove: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); err_put: put_xrcd_read(xrcd_uobj); put_uobj_write(&obj->uevent.uobject); return ret; } ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_query_qp cmd; struct ib_uverbs_query_qp_resp resp; struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_init_attr *init_attr; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; attr = kmalloc(sizeof *attr, GFP_KERNEL); init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); if (!attr || !init_attr) { ret = -ENOMEM; goto out; } qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; } ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); put_qp_read(qp); if (ret) goto out; memset(&resp, 0, sizeof resp); resp.qp_state = attr->qp_state; resp.cur_qp_state = attr->cur_qp_state; resp.path_mtu = attr->path_mtu; resp.path_mig_state = attr->path_mig_state; resp.qkey = attr->qkey; resp.rq_psn = attr->rq_psn; resp.sq_psn = attr->sq_psn; resp.dest_qp_num = attr->dest_qp_num; resp.qp_access_flags = attr->qp_access_flags; resp.pkey_index = attr->pkey_index; 
resp.alt_pkey_index = attr->alt_pkey_index; resp.sq_draining = attr->sq_draining; resp.max_rd_atomic = attr->max_rd_atomic; resp.max_dest_rd_atomic = attr->max_dest_rd_atomic; resp.min_rnr_timer = attr->min_rnr_timer; resp.port_num = attr->port_num; resp.timeout = attr->timeout; resp.retry_cnt = attr->retry_cnt; resp.rnr_retry = attr->rnr_retry; resp.alt_port_num = attr->alt_port_num; resp.alt_timeout = attr->alt_timeout; memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); resp.dest.flow_label = attr->ah_attr.grh.flow_label; resp.dest.sgid_index = attr->ah_attr.grh.sgid_index; resp.dest.hop_limit = attr->ah_attr.grh.hop_limit; resp.dest.traffic_class = attr->ah_attr.grh.traffic_class; resp.dest.dlid = attr->ah_attr.dlid; resp.dest.sl = attr->ah_attr.sl; resp.dest.src_path_bits = attr->ah_attr.src_path_bits; resp.dest.static_rate = attr->ah_attr.static_rate; resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH); resp.dest.port_num = attr->ah_attr.port_num; memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; resp.alt_dest.dlid = attr->alt_ah_attr.dlid; resp.alt_dest.sl = attr->alt_ah_attr.sl; resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate; resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH); resp.alt_dest.port_num = attr->alt_ah_attr.port_num; resp.max_send_wr = init_attr->cap.max_send_wr; resp.max_recv_wr = init_attr->cap.max_recv_wr; resp.max_send_sge = init_attr->cap.max_send_sge; resp.max_recv_sge = init_attr->cap.max_recv_sge; resp.max_inline_data = init_attr->cap.max_inline_data; resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; if (copy_to_user((void __user *) (unsigned long) cmd.response, 
&resp, sizeof resp)) ret = -EFAULT; out: kfree(attr); kfree(init_attr); return ret ? ret : in_len; } /* Remove ignored fields set in the attribute mask */ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) { switch (qp_type) { case IB_QPT_XRC_INI: return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); case IB_QPT_XRC_TGT: return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY); default: return mask; } } -static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file, +ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, - int out_len, - enum uverbs_cmd_type cmd_type) + int out_len) { - struct ib_uverbs_modify_qp_ex cmd; - struct ib_udata udata; - struct ib_qp *qp; - struct ib_qp_attr *attr; - struct ib_qp_attr_ex *attrx; - int ret; - void *p; - union ib_gid sgid; - union ib_gid *dgid; - u8 port_num; + struct ib_uverbs_modify_qp cmd; + struct ib_udata udata; + struct ib_qp *qp; + struct ib_qp_attr *attr; + int ret; - if (cmd_type == IB_USER_VERBS_CMD_BASIC) { - p = &cmd; - p += sizeof(cmd.comp_mask); - if (copy_from_user(p, buf, - sizeof(struct ib_uverbs_modify_qp))) + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - } else { - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; - } INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); - attrx = kzalloc(sizeof(*attrx), GFP_KERNEL); - if (!attrx) + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) return -ENOMEM; - attr = (struct ib_qp_attr *)attrx; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { - kfree(attrx); - return -EINVAL; + ret = -EINVAL; + goto out; } attr->qp_state = cmd.qp_state; attr->cur_qp_state = cmd.cur_qp_state; attr->path_mtu = cmd.path_mtu; attr->path_mig_state = cmd.path_mig_state; attr->qkey = cmd.qkey; attr->rq_psn = cmd.rq_psn; attr->sq_psn = cmd.sq_psn; attr->dest_qp_num = cmd.dest_qp_num; attr->qp_access_flags = 
cmd.qp_access_flags; attr->pkey_index = cmd.pkey_index; attr->alt_pkey_index = cmd.alt_pkey_index; attr->en_sqd_async_notify = cmd.en_sqd_async_notify; attr->max_rd_atomic = cmd.max_rd_atomic; attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic; attr->min_rnr_timer = cmd.min_rnr_timer; attr->port_num = cmd.port_num; attr->timeout = cmd.timeout; attr->retry_cnt = cmd.retry_cnt; attr->rnr_retry = cmd.rnr_retry; attr->alt_port_num = cmd.alt_port_num; attr->alt_timeout = cmd.alt_timeout; memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16); attr->ah_attr.grh.flow_label = cmd.dest.flow_label; attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index; attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit; attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class; attr->ah_attr.dlid = cmd.dest.dlid; attr->ah_attr.sl = cmd.dest.sl; attr->ah_attr.src_path_bits = cmd.dest.src_path_bits; attr->ah_attr.static_rate = cmd.dest.static_rate; attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0; attr->ah_attr.port_num = cmd.dest.port_num; memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16); attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label; attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index; attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit; attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class; attr->alt_ah_attr.dlid = cmd.alt_dest.dlid; attr->alt_ah_attr.sl = cmd.alt_dest.sl; attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits; attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; - port_num = (cmd.attr_mask & IB_QP_PORT) ? 
cmd.port_num : qp->port_num; - if ((cmd.attr_mask & IB_QP_AV) && port_num && - (rdma_port_get_link_layer(qp->device, port_num) == - IB_LINK_LAYER_ETHERNET)) { - ret = ib_query_gid(qp->device, port_num, - attr->ah_attr.grh.sgid_index, &sgid); - if (ret) - goto out; - dgid = &attr->ah_attr.grh.dgid; - if (rdma_link_local_addr((struct in6_addr *)dgid->raw)) { - rdma_get_ll_mac((struct in6_addr *)dgid->raw, - attr->ah_attr.dmac); - rdma_get_ll_mac((struct in6_addr *)sgid.raw, - attr->smac); - attr->vlan_id = rdma_get_vlan_id(&sgid); - } else { - ret = rdma_addr_find_dmac_by_grh(&sgid, dgid, - attr->ah_attr.dmac, - &attr->vlan_id, -1U); - if (ret) - goto out; - ret = rdma_addr_find_smac_by_sgid(&sgid, attr->smac, - NULL, -1U); - if (ret) - goto out; - } - cmd.attr_mask |= IB_QP_SMAC; - if (attr->vlan_id < 0xFFFF) - cmd.attr_mask |= IB_QP_VID; - } - if (cmd_type == IB_USER_VERBS_CMD_EXTENDED) { - if (cmd.comp_mask & IB_UVERBS_QP_ATTR_DCT_KEY) - attrx->dct_key = cmd.dct_key; - } if (qp->real_qp == qp) { + ret = ib_resolve_eth_dmac(qp, attr, &cmd.attr_mask); + if (ret) + goto release_qp; ret = qp->device->modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); - if (!ret && (cmd.attr_mask & IB_QP_PORT)) - qp->port_num = attr->port_num; } else { ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask)); } if (ret) - goto out; + goto release_qp; ret = in_len; -out: +release_qp: put_qp_read(qp); - kfree(attrx); +out: + kfree(attr); + return ret; } -ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - return __uverbs_modify_qp(file, buf, in_len, out_len, - IB_USER_VERBS_CMD_BASIC); -} - ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp resp; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_qp *qp; struct ib_uqp_object 
*obj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; memset(&resp, 0, sizeof resp); uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext); if (!uobj) return -EINVAL; qp = uobj->object; obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); if (!list_empty(&obj->mcast_list)) { put_uobj_write(uobj); return -EBUSY; } ret = ib_destroy_qp(qp); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; if (obj->uxrcd) atomic_dec(&obj->uxrcd->refcnt); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); ib_uverbs_release_uevent(file, &obj->uevent); resp.events_reported = obj->uevent.events_reported; put_uobj(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; return in_len; } +static void *alloc_wr(size_t wr_size, __u32 num_sge) +{ + return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) + + num_sge * sizeof (struct ib_sge), GFP_KERNEL); +}; + ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_post_send cmd; struct ib_uverbs_post_send_resp resp; struct ib_uverbs_send_wr *user_wr; struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; - struct ib_qp *qp; + struct ib_qp *qp; int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; + size_t next_size; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count + cmd.sge_count * sizeof (struct ib_uverbs_sge)) return -EINVAL; if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr)) return -EINVAL; user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); if (!user_wr) return -ENOMEM; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) goto out; is_ud = qp->qp_type == IB_QPT_UD; sg_ind = 0; last = NULL; for (i = 0; i < cmd.wr_count; ++i) { if 
(copy_from_user(user_wr, buf + sizeof cmd + i * cmd.wqe_size, cmd.wqe_size)) { ret = -EFAULT; goto out_put; } if (user_wr->num_sge + sg_ind > cmd.sge_count) { ret = -EINVAL; goto out_put; } - next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + - user_wr->num_sge * sizeof (struct ib_sge), - GFP_KERNEL); - if (!next) { - ret = -ENOMEM; - goto out_put; - } + if (is_ud) { + struct ib_ud_wr *ud; + if (user_wr->opcode != IB_WR_SEND && + user_wr->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + + next_size = sizeof(*ud); + ud = alloc_wr(next_size, user_wr->num_sge); + if (!ud) { + ret = -ENOMEM; + goto out_put; + } + + ud->ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); + if (!ud->ah) { + kfree(ud); + ret = -EINVAL; + goto out_put; + } + ud->remote_qpn = user_wr->wr.ud.remote_qpn; + ud->remote_qkey = user_wr->wr.ud.remote_qkey; + + next = &ud->wr; + } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE || + user_wr->opcode == IB_WR_RDMA_READ) { + struct ib_rdma_wr *rdma; + + next_size = sizeof(*rdma); + rdma = alloc_wr(next_size, user_wr->num_sge); + if (!rdma) { + ret = -ENOMEM; + goto out_put; + } + + rdma->remote_addr = user_wr->wr.rdma.remote_addr; + rdma->rkey = user_wr->wr.rdma.rkey; + + next = &rdma->wr; + } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + struct ib_atomic_wr *atomic; + + next_size = sizeof(*atomic); + atomic = alloc_wr(next_size, user_wr->num_sge); + if (!atomic) { + ret = -ENOMEM; + goto out_put; + } + + atomic->remote_addr = user_wr->wr.atomic.remote_addr; + atomic->compare_add = user_wr->wr.atomic.compare_add; + atomic->swap = user_wr->wr.atomic.swap; + atomic->rkey = user_wr->wr.atomic.rkey; + + next = &atomic->wr; + } else if (user_wr->opcode == IB_WR_SEND || + user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_SEND_WITH_INV) { + next_size = sizeof(*next); + next = alloc_wr(next_size, 
user_wr->num_sge); + if (!next) { + ret = -ENOMEM; + goto out_put; + } + } else { + ret = -EINVAL; + goto out_put; + } + + if (user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) { + next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey; + } + if (!last) wr = next; - else + else last->next = next; last = next; next->next = NULL; next->wr_id = user_wr->wr_id; next->num_sge = user_wr->num_sge; next->opcode = user_wr->opcode; next->send_flags = user_wr->send_flags; - if (is_ud) { - next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, - file->ucontext); - if (!next->wr.ud.ah) { - ret = -EINVAL; - goto out_put; - } - next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; - next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; - } else { - switch (next->opcode) { - case IB_WR_RDMA_WRITE_WITH_IMM: - next->ex.imm_data = - (__be32 __force) user_wr->ex.imm_data; - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - next->wr.rdma.remote_addr = - user_wr->wr.rdma.remote_addr; - next->wr.rdma.rkey = - user_wr->wr.rdma.rkey; - break; - case IB_WR_SEND_WITH_IMM: - next->ex.imm_data = - (__be32 __force) user_wr->ex.imm_data; - break; - case IB_WR_SEND_WITH_INV: - next->ex.invalidate_rkey = - user_wr->ex.invalidate_rkey; - break; - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - next->wr.atomic.remote_addr = - user_wr->wr.atomic.remote_addr; - next->wr.atomic.compare_add = - user_wr->wr.atomic.compare_add; - next->wr.atomic.swap = user_wr->wr.atomic.swap; - next->wr.atomic.rkey = user_wr->wr.atomic.rkey; - break; - default: - break; - } - } - if (next->num_sge) { - next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + next->sg_list = (void *)((char *)next + + ALIGN(next_size, sizeof(struct ib_sge))); if (copy_from_user(next->sg_list, - buf + sizeof cmd + + (const char *)buf + sizeof cmd + 
cmd.wr_count * cmd.wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; goto out_put; } sg_ind += next->num_sge; } else next->sg_list = NULL; } resp.bad_wr = 0; ret = qp->device->post_send(qp->real_qp, wr, &bad_wr); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) - break; + break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out_put: put_qp_read(qp); while (wr) { - if (is_ud && wr->wr.ud.ah) - put_ah_read(wr->wr.ud.ah); + if (is_ud && ud_wr(wr)->ah) + put_ah_read(ud_wr(wr)->ah); next = wr->next; kfree(wr); wr = next; } out: kfree(user_wr); return ret ? ret : in_len; } static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, int in_len, u32 wr_count, u32 sge_count, u32 wqe_size) { struct ib_uverbs_recv_wr *user_wr; struct ib_recv_wr *wr = NULL, *last, *next; int sg_ind; int i; - int ret; + int ret; if (in_len < wqe_size * wr_count + sge_count * sizeof (struct ib_uverbs_sge)) return ERR_PTR(-EINVAL); if (wqe_size < sizeof (struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); user_wr = kmalloc(wqe_size, GFP_KERNEL); if (!user_wr) return ERR_PTR(-ENOMEM); sg_ind = 0; last = NULL; for (i = 0; i < wr_count; ++i) { if (copy_from_user(user_wr, buf + i * wqe_size, wqe_size)) { ret = -EFAULT; goto err; } if (user_wr->num_sge + sg_ind > sge_count) { - ret = -EINVAL; - goto err; - } + ret = -EINVAL; + goto err; + } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), GFP_KERNEL); if (!next) { ret = -ENOMEM; goto err; - } + } if (!last) wr = next; else last->next = next; last = next; next->next = NULL; next->wr_id = user_wr->wr_id; next->num_sge = user_wr->num_sge; if (next->num_sge) { - next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + next->sg_list = (void *)((char *)next + + ALIGN(sizeof *next, sizeof (struct ib_sge))); if 
(copy_from_user(next->sg_list, - buf + wr_count * wqe_size + + (const char *)buf + wr_count * wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; goto err; } sg_ind += next->num_sge; } else next->sg_list = NULL; } kfree(user_wr); return wr; err: kfree(user_wr); while (wr) { next = wr->next; kfree(wr); wr = next; } return ERR_PTR(ret); } ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_recv cmd; struct ib_uverbs_post_recv_resp resp; struct ib_recv_wr *wr, *next, *bad_wr; struct ib_qp *qp; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, in_len - sizeof cmd, cmd.wr_count, cmd.sge_count, cmd.wqe_size); if (IS_ERR(wr)) return PTR_ERR(wr); qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) goto out; resp.bad_wr = 0; ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); put_qp_read(qp); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: while (wr) { next = wr->next; kfree(wr); wr = next; } return ret ? 
ret : in_len; } ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_srq_recv cmd; struct ib_uverbs_post_srq_recv_resp resp; struct ib_recv_wr *wr, *next, *bad_wr; struct ib_srq *srq; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, in_len - sizeof cmd, cmd.wr_count, cmd.sge_count, cmd.wqe_size); if (IS_ERR(wr)) return PTR_ERR(wr); srq = idr_read_srq(cmd.srq_handle, file->ucontext); if (!srq) goto out; resp.bad_wr = 0; ret = srq->device->post_srq_recv(srq, wr, &bad_wr); put_srq_read(srq); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) ret = -EFAULT; out: while (wr) { next = wr->next; kfree(wr); wr = next; } return ret ? ret : in_len; } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_ah cmd; struct ib_uverbs_create_ah_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_ah *ah; struct ib_ah_attr attr; int ret; if (out_len < sizeof resp) return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); down_write(&uobj->mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; } attr.dlid = cmd.attr.dlid; attr.sl = cmd.attr.sl; attr.src_path_bits = cmd.attr.src_path_bits; attr.static_rate = cmd.attr.static_rate; attr.ah_flags = cmd.attr.is_global ? 
IB_AH_GRH : 0; attr.port_num = cmd.attr.port_num; attr.grh.flow_label = cmd.attr.grh.flow_label; attr.grh.sgid_index = cmd.attr.grh.sgid_index; attr.grh.hop_limit = cmd.attr.grh.hop_limit; attr.grh.traffic_class = cmd.attr.grh.traffic_class; + memset(&attr.dmac, 0, sizeof(attr.dmac)); memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); ah = ib_create_ah(pd, &attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto err_put; } ah->uobject = uobj; uobj->object = ah; ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj); if (ret) goto err_destroy; resp.ah_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_copy; } put_pd_read(pd); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->ah_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); return in_len; err_copy: idr_remove_uobj(&ib_uverbs_ah_idr, uobj); err_destroy: ib_destroy_ah(ah); err_put: put_pd_read(pd); err: put_uobj_write(uobj); return ret; } ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_ah cmd; struct ib_ah *ah; struct ib_uobject *uobj; - int ret; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext); if (!uobj) return -EINVAL; ah = uobj->object; ret = ib_destroy_ah(ah); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; idr_remove_uobj(&ib_uverbs_ah_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return in_len; } ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_attach_mcast cmd; struct ib_qp *qp; struct ib_uqp_object *obj; struct ib_uverbs_mcast_entry *mcast; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; qp = 
idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { ret = 0; goto out_put; } mcast = kmalloc(sizeof *mcast, GFP_KERNEL); if (!mcast) { ret = -ENOMEM; goto out_put; } mcast->lid = cmd.mlid; memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw); ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid); if (!ret) list_add_tail(&mcast->list, &obj->mcast_list); else kfree(mcast); out_put: put_qp_write(qp); return ret ? ret : in_len; } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_detach_mcast cmd; struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; qp = idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid); if (ret) goto out_put; obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { list_del(&mcast->list); kfree(mcast); break; } out_put: put_qp_write(qp); return ret ? 
ret : in_len; } -static int __uverbs_create_xsrq(struct ib_uverbs_file *file, - struct ib_uverbs_create_xsrq *cmd, - struct ib_udata *udata) +static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec) { - struct ib_uverbs_create_srq_resp resp; - struct ib_usrq_object *obj; - struct ib_pd *pd; - struct ib_srq *srq; - struct ib_uobject *uninitialized_var(xrcd_uobj); - struct ib_srq_init_attr attr; - int ret; + /* Returns user space filter size, includes padding */ + return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; +} - obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; +static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, + u16 ib_real_filter_sz) +{ + /* + * User space filter structures must be 64 bit aligned, otherwise this + * may pass, but we won't handle additional new attributes. + */ - init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); - down_write(&obj->uevent.uobject.mutex); + if (kern_filter_size > ib_real_filter_sz) { + if (memchr_inv((char *)kern_spec_filter + + ib_real_filter_sz, 0, + kern_filter_size - ib_real_filter_sz)) + return -EINVAL; + return ib_real_filter_sz; + } + return kern_filter_size; +} - if (cmd->srq_type == IB_SRQT_XRC) { - attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); - if (!attr.ext.xrc.xrcd) { - ret = -EINVAL; - goto err; - } +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + ssize_t actual_filter_sz; + ssize_t kern_filter_sz; + ssize_t ib_filter_sz; + void *kern_spec_mask; + void *kern_spec_val; - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); - atomic_inc(&obj->uxrcd->refcnt); + if (kern_spec->reserved) + return -EINVAL; - attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); - if (!attr.ext.xrc.cq) { - ret = -EINVAL; - goto err_put_xrcd; - } - } + ib_spec->type = kern_spec->type; - pd = 
idr_read_pd(cmd->pd_handle, file->ucontext); - if (!pd) { - ret = -EINVAL; - goto err_put_cq; - } + kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); + /* User flow spec size must be aligned to 4 bytes */ + if (kern_filter_sz != ALIGN(kern_filter_sz, 4)) + return -EINVAL; - attr.event_handler = ib_uverbs_srq_event_handler; - attr.srq_context = file; - attr.srq_type = cmd->srq_type; - attr.attr.max_wr = cmd->max_wr; - attr.attr.max_sge = cmd->max_sge; - attr.attr.srq_limit = cmd->srq_limit; + kern_spec_val = (char *)kern_spec + + sizeof(struct ib_uverbs_flow_spec_hdr); + kern_spec_mask = (char *)kern_spec_val + kern_filter_sz; - obj->uevent.events_reported = 0; - INIT_LIST_HEAD(&obj->uevent.event_list); + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_eth); + memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV4: + ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv4); + memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV6: + ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv6); + memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz); - srq = 
pd->device->create_srq(pd, &attr, udata); - if (IS_ERR(srq)) { - ret = PTR_ERR(srq); - goto err_put; + if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) || + (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20)) + return -EINVAL; + break; + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp); + memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz); + break; + default: + return -EINVAL; } + return 0; +} - srq->device = pd->device; - srq->pd = pd; - srq->srq_type = cmd->srq_type; - srq->uobject = &obj->uevent.uobject; - srq->event_handler = attr.event_handler; - srq->srq_context = attr.srq_context; +int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_wq cmd = {}; + struct ib_uverbs_ex_create_wq_resp resp = {}; + struct ib_uwq_object *obj; + int err = 0; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_wq *wq; + struct ib_wq_init_attr wq_init_attr = {}; + size_t required_cmd_sz; + size_t required_resp_len; - if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.cq = attr.ext.xrc.cq; - srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.cq->usecnt); - atomic_inc(&attr.ext.xrc.xrcd->usecnt); - } + required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge); + required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn); - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); + if (ucore->inlen < required_cmd_sz) + return -EINVAL; - obj->uevent.uobject.object = srq; - ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); - if (ret) - goto err_destroy; + if (ucore->outlen < required_resp_len) + 
return -ENOSPC; - memset(&resp, 0, sizeof resp); - resp.srq_handle = obj->uevent.uobject.id; - resp.max_wr = attr.attr.max_wr; - resp.max_sge = attr.attr.max_sge; - if (cmd->srq_type == IB_SRQT_XRC) - resp.srqn = srq->ext.xrc.srq_num; + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; - if (copy_to_user((void __user *) (unsigned long) cmd->response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_copy; + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + obj = kmalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, + &wq_lock_class); + down_write(&obj->uevent.uobject.mutex); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + err = -EINVAL; + goto err_uobj; } - if (cmd->srq_type == IB_SRQT_XRC) { - put_uobj_read(xrcd_uobj); - put_cq_read(attr.ext.xrc.cq); + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) { + err = -EINVAL; + goto err_put_pd; } + + wq_init_attr.cq = cq; + wq_init_attr.max_sge = cmd.max_sge; + wq_init_attr.max_wr = cmd.max_wr; + wq_init_attr.wq_context = file; + wq_init_attr.wq_type = cmd.wq_type; + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq = pd->device->create_wq(pd, &wq_init_attr, uhw); + if (IS_ERR(wq)) { + err = PTR_ERR(wq); + goto err_put_cq; + } + + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + err = idr_add_uobj(&ib_uverbs_wq_idr, 
&obj->uevent.uobject); + if (err) + goto destroy_wq; + + memset(&resp, 0, sizeof(resp)); + resp.wq_handle = obj->uevent.uobject.id; + resp.max_sge = wq_init_attr.max_sge; + resp.max_wr = wq_init_attr.max_wr; + resp.wqn = wq->wq_num; + resp.response_length = required_resp_len; + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; + put_pd_read(pd); + put_cq_read(cq); mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; - up_write(&obj->uevent.uobject.mutex); - return 0; err_copy: - idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); - -err_destroy: - ib_destroy_srq(srq); - -err_put: - put_pd_read(pd); - + idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); +destroy_wq: + ib_destroy_wq(wq); err_put_cq: - if (cmd->srq_type == IB_SRQT_XRC) - put_cq_read(attr.ext.xrc.cq); - -err_put_xrcd: - if (cmd->srq_type == IB_SRQT_XRC) { - atomic_dec(&obj->uxrcd->refcnt); - put_uobj_read(xrcd_uobj); - } - -err: + put_cq_read(cq); +err_put_pd: + put_pd_read(pd); +err_uobj: put_uobj_write(&obj->uevent.uobject); - return ret; + + return err; } -ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_create_srq cmd; - struct ib_uverbs_create_xsrq xcmd; - struct ib_uverbs_create_srq_resp resp; - struct ib_udata udata; - int ret; + struct ib_uverbs_ex_destroy_wq cmd = {}; + struct ib_uverbs_ex_destroy_wq_resp resp = {}; + struct ib_wq *wq; + struct ib_uobject *uobj; + struct ib_uwq_object *obj; + size_t required_cmd_sz; + size_t required_resp_len; + int ret; - if (out_len < sizeof resp) - return -ENOSPC; + required_cmd_sz = offsetof(typeof(cmd), wq_handle) + 
sizeof(cmd.wq_handle); + required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + if (ucore->inlen < required_cmd_sz) + return -EINVAL; - xcmd.response = cmd.response; - xcmd.user_handle = cmd.user_handle; - xcmd.srq_type = IB_SRQT_BASIC; - xcmd.pd_handle = cmd.pd_handle; - xcmd.max_wr = cmd.max_wr; - xcmd.max_sge = cmd.max_sge; - xcmd.srq_limit = cmd.srq_limit; + if (ucore->outlen < required_resp_len) + return -ENOSPC; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; - ret = __uverbs_create_xsrq(file, &xcmd, &udata); + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) return ret; - return in_len; -} + if (cmd.comp_mask) + return -EOPNOTSUPP; -ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, int out_len) -{ - struct ib_uverbs_create_xsrq cmd; - struct ib_uverbs_create_srq_resp resp; - struct ib_udata udata; - int ret; + resp.response_length = required_resp_len; + uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle, + file->ucontext); + if (!uobj) + return -EINVAL; - if (out_len < sizeof resp) - return -ENOSPC; + wq = uobj->object; + obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); + ret = ib_destroy_wq(wq); + if (!ret) + uobj->live = 0; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); - - ret = __uverbs_create_xsrq(file, &cmd, &udata); + put_uobj_write(uobj); if (ret) return ret; - return in_len; -} + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); -ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, - const char __user *buf, int 
in_len, - int out_len) -{ - struct ib_uverbs_modify_srq cmd; - struct ib_udata udata; - struct ib_srq *srq; - struct ib_srq_attr attr; - int ret; + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + ib_uverbs_release_uevent(file, &obj->uevent); + resp.events_reported = obj->uevent.events_reported; + put_uobj(uobj); - INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, - out_len); + ret = ib_copy_to_udata(ucore, &resp, resp.response_length); + if (ret) + return ret; - srq = idr_read_srq(cmd.srq_handle, file->ucontext); - if (!srq) - return -EINVAL; - - attr.max_wr = cmd.max_wr; - attr.srq_limit = cmd.srq_limit; - - ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); - - put_srq_read(srq); - - return ret ? ret : in_len; + return 0; } -ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, - const char __user *buf, - int in_len, int out_len) +int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_query_srq cmd; - struct ib_uverbs_query_srq_resp resp; - struct ib_srq_attr attr; - struct ib_srq *srq; - int ret; + struct ib_uverbs_ex_modify_wq cmd = {}; + struct ib_wq *wq; + struct ib_wq_attr wq_attr = {}; + size_t required_cmd_sz; + int ret; - if (out_len < sizeof resp) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - srq = idr_read_srq(cmd.srq_handle, file->ucontext); - if (!srq) + required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state); + if (ucore->inlen < required_cmd_sz) return -EINVAL; - ret = ib_query_srq(srq, &attr); + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; - put_srq_read(srq); - + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) - return ret; + return ret; - 
memset(&resp, 0, sizeof resp); + if (!cmd.attr_mask) + return -EINVAL; - resp.max_wr = attr.max_wr; - resp.max_sge = attr.max_sge; - resp.srq_limit = attr.srq_limit; + if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE)) + return -EINVAL; - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) - return -EFAULT; + wq = idr_read_wq(cmd.wq_handle, file->ucontext); + if (!wq) + return -EINVAL; - return in_len; + wq_attr.curr_wq_state = cmd.curr_wq_state; + wq_attr.wq_state = cmd.wq_state; + ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw); + put_wq_read(wq); + return ret; } -ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_destroy_srq cmd; - struct ib_uverbs_destroy_srq_resp resp; - struct ib_uobject *uobj; - struct ib_srq *srq; - struct ib_uevent_object *obj; - int ret = -EINVAL; - struct ib_usrq_object *us; - enum ib_srq_type srq_type; + struct ib_uverbs_ex_create_rwq_ind_table cmd = {}; + struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; + struct ib_uobject *uobj; + int err = 0; + struct ib_rwq_ind_table_init_attr init_attr = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_wq **wqs = NULL; + u32 *wqs_handles = NULL; + struct ib_wq *wq = NULL; + int i, j, num_read_wqs; + u32 num_wq_handles; + u32 expected_in_size; + size_t required_cmd_sz_header; + size_t required_resp_len; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size); + required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); - uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); - if (!uobj) + if (ucore->inlen < required_cmd_sz_header) return -EINVAL; - srq = 
uobj->object; - obj = container_of(uobj, struct ib_uevent_object, uobject); - srq_type = srq->srq_type; - ret = ib_destroy_srq(srq); - if (!ret) - uobj->live = 0; + if (ucore->outlen < required_resp_len) + return -ENOSPC; - put_uobj_write(uobj); + err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header); + if (err) + return err; - if (ret) - return ret; + ucore->inbuf = (const char *)ucore->inbuf + required_cmd_sz_header; + ucore->inlen -= required_cmd_sz_header; - if (srq_type == IB_SRQT_XRC) { - us = container_of(obj, struct ib_usrq_object, uevent); - atomic_dec(&us->uxrcd->refcnt); - } + if (cmd.comp_mask) + return -EOPNOTSUPP; - idr_remove_uobj(&ib_uverbs_srq_idr, uobj); + if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE) + return -EINVAL; - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); + num_wq_handles = 1 << cmd.log_ind_tbl_size; + expected_in_size = num_wq_handles * sizeof(__u32); + if (num_wq_handles == 1) + /* input size for wq handles is u64 aligned */ + expected_in_size += sizeof(__u32); - ib_uverbs_release_uevent(file, obj); + if (ucore->inlen < expected_in_size) + return -EINVAL; - memset(&resp, 0, sizeof resp); - resp.events_reported = obj->events_reported; + if (ucore->inlen > expected_in_size && + !ib_is_udata_cleared(ucore, expected_in_size, + ucore->inlen - expected_in_size)) + return -EOPNOTSUPP; - put_uobj(uobj); - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) - ret = -EFAULT; - - return ret ? 
ret : in_len; -} - -ssize_t ib_uverbs_exp_create_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) -{ - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_create_dct cmd; - struct ib_uverbs_create_dct_resp resp; - struct ib_udata udata; - struct ib_udct_object *obj; - struct ib_dct *dct; - int ret; - struct ib_dct_init_attr attr; - struct ib_pd *pd = NULL; - struct ib_cq *cq = NULL; - struct ib_srq *srq = NULL; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); - if (ret) - return ret; - - obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) + wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), + GFP_KERNEL); + if (!wqs_handles) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, - &dct_lock_class); - down_write(&obj->uobject.mutex); + err = ib_copy_from_udata(wqs_handles, ucore, + num_wq_handles * sizeof(__u32)); + if (err) + goto err_free; - pd = idr_read_pd(cmd.pd_handle, file->ucontext); - if (!pd) { - ret = -EINVAL; - goto err_pd; + wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); + if (!wqs) { + err = -ENOMEM; + goto err_free; } - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); - if (!cq) { - ret = -EINVAL; - goto err_put; + for (num_read_wqs = 0; num_read_wqs < num_wq_handles; + num_read_wqs++) { + wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext); + if (!wq) { + err = -EINVAL; + goto put_wqs; + } + + wqs[num_read_wqs] = wq; } - srq = idr_read_srq(cmd.srq_handle, file->ucontext); - if (!srq) { - ret = -EINVAL; - goto err_put; + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto put_wqs; } - attr.cq = cq; - attr.access_flags = cmd.access_flags; - attr.min_rnr_timer = cmd.min_rnr_timer; - attr.srq = srq; - attr.tclass = cmd.tclass; - attr.flow_label = cmd.flow_label; - attr.dc_key = cmd.dc_key; - attr.mtu = cmd.mtu; - attr.port = 
cmd.port; - attr.pkey_index = cmd.pkey_index; - attr.gid_index = cmd.gid_index; - attr.hop_limit = cmd.hop_limit; - attr.create_flags = cmd.create_flags; + init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class); + down_write(&uobj->mutex); + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw); - dct = ib_create_dct(pd, &attr, &udata); - if (IS_ERR(dct)) { - ret = PTR_ERR(dct); - goto err_put; + if (IS_ERR(rwq_ind_tbl)) { + err = PTR_ERR(rwq_ind_tbl); + goto err_uobj; } - dct->device = file->device->ib_dev; - dct->uobject = &obj->uobject; + rwq_ind_tbl->ind_tbl = wqs; + rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; + rwq_ind_tbl->uobject = uobj; + uobj->object = rwq_ind_tbl; + rwq_ind_tbl->device = ib_dev; + atomic_set(&rwq_ind_tbl->usecnt, 0); - obj->uobject.object = dct; - ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uobject); - if (ret) - goto err_dct; + for (i = 0; i < num_wq_handles; i++) + atomic_inc(&wqs[i]->usecnt); - memset(&resp, 0, sizeof(resp)); - resp.dct_handle = obj->uobject.id; - resp.dctn = dct->dct_num; + err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + if (err) + goto destroy_ind_tbl; - ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); - if (ret) + resp.ind_tbl_handle = uobj->id; + resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; + resp.response_length = required_resp_len; + + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) goto err_copy; + kfree(wqs_handles); + + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); + mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->dct_list); + list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list); mutex_unlock(&file->mutex); - obj->uobject.live = 1; + uobj->live = 1; - put_srq_read(srq); - put_cq_read(cq); - put_pd_read(pd); + up_write(&uobj->mutex); + return 0; - up_write(&obj->uobject.mutex); - - return in_len; - 
err_copy: - idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uobject); - -err_dct: - ib_destroy_dct(dct); - -err_put: - if (srq) - put_srq_read(srq); - - if (cq) - put_cq_read(cq); - - put_pd_read(pd); - -err_pd: - put_uobj_write(&obj->uobject); - return ret; + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); +destroy_ind_tbl: + ib_destroy_rwq_ind_table(rwq_ind_tbl); +err_uobj: + put_uobj_write(uobj); +put_wqs: + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); +err_free: + kfree(wqs_handles); + kfree(wqs); + return err; } -ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) +int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) { - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_destroy_dct cmd; - struct ib_uverbs_destroy_dct_resp resp; - struct ib_uobject *uobj; - struct ib_dct *dct; - struct ib_udct_object *obj; - int ret; + struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_uobject *uobj; + int ret; + struct ib_wq **ind_tbl; + size_t required_cmd_sz; - if (out_len < sizeof(resp)) - return -ENOSPC; + required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle); - ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) return ret; - uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.user_handle, file->ucontext); + if (cmd.comp_mask) + return -EOPNOTSUPP; + + uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle, + file->ucontext); if (!uobj) return -EINVAL; + rwq_ind_tbl = uobj->object; + ind_tbl = 
rwq_ind_tbl->ind_tbl; - dct = uobj->object; - obj = container_of(dct->uobject, struct ib_udct_object, uobject); - - ret = ib_destroy_dct(dct); + ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); if (!ret) uobj->live = 0; put_uobj_write(uobj); if (ret) return ret; - idr_remove_uobj(&ib_uverbs_dct_idr, uobj); + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); - memset(&resp, 0, sizeof(resp)); - put_uobj(uobj); - - ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); - if (ret) - return ret; - - return in_len; + kfree(ind_tbl); + return ret; } -ssize_t ib_uverbs_exp_query_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) -{ - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_query_dct cmd; - struct ib_uverbs_query_dct_resp resp; - struct ib_dct *dct; - struct ib_dct_attr *attr; - int err; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - err = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); - if (err) - return err; - - attr = kmalloc(sizeof(*attr), GFP_KERNEL); - if (!attr) { - err = -ENOMEM; - goto out; - } - - dct = idr_read_dct(cmd.dct_handle, file->ucontext); - if (!dct) { - err = -EINVAL; - goto out; - } - - err = ib_query_dct(dct, attr); - - put_dct_read(dct); - - if (err) - goto out; - - memset(&resp, 0, sizeof(resp)); - - resp.dc_key = attr->dc_key; - resp.access_flags = attr->access_flags; - resp.flow_label = attr->flow_label; - resp.key_violations = attr->key_violations; - resp.port = attr->port; - resp.min_rnr_timer = attr->min_rnr_timer; - resp.tclass = attr->tclass; - resp.mtu = attr->mtu; - resp.pkey_index = attr->pkey_index; - resp.gid_index = attr->gid_index; - resp.hop_limit = attr->hop_limit; - resp.state = attr->state; - - err = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); - -out: - kfree(attr); - - return err ? 
err : in_len; -} - -/* - * Experimental functions - */ - -static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; - -static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) -{ - ib_spec->type = kern_spec->type; - - switch (ib_spec->type) { - case IB_FLOW_SPEC_ETH: - ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); - memcpy(&ib_spec->eth.val, &kern_spec->eth.val, - sizeof(struct ib_flow_eth_filter)); - memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, - sizeof(struct ib_flow_eth_filter)); - break; - case IB_FLOW_SPEC_IB: - ib_spec->ib.size = sizeof(struct ib_flow_spec_ib); - memcpy(&ib_spec->ib.val, &kern_spec->ib.val, - sizeof(struct ib_flow_ib_filter)); - memcpy(&ib_spec->ib.mask, &kern_spec->ib.mask, - sizeof(struct ib_flow_ib_filter)); - break; - case IB_FLOW_SPEC_IPV4: - ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); - memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, - sizeof(struct ib_flow_ipv4_filter)); - memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, - sizeof(struct ib_flow_ipv4_filter)); - break; - case IB_FLOW_SPEC_TCP: - case IB_FLOW_SPEC_UDP: - ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); - memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, - sizeof(struct ib_flow_tcp_udp_filter)); - memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, - sizeof(struct ib_flow_tcp_udp_filter)); - break; - default: - return -EINVAL; - } - return 0; -} - int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; int err = 0; void *kern_spec; void *ib_spec; int i; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + if (ucore->outlen < sizeof(resp)) return 
-ENOSPC; err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (err) return err; - ucore->inbuf += sizeof(cmd); + ucore->inbuf = (const char *)ucore->inbuf + sizeof(cmd); ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; - if (priv_check(curthread, PRIV_NET_RAW) && !disable_raw_qp_enforcement) + if (priv_check(curthread, PRIV_NET_RAW) != 0) return -EPERM; + if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) + return -EINVAL; + + if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || + (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) + return -EINVAL; + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; if (cmd.flow_attr.size > ucore->inlen || cmd.flow_attr.size > (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) + return -EINVAL; + if (cmd.flow_attr.num_of_specs) { - kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + - cmd.flow_attr.size, GFP_KERNEL); + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); if (!kern_flow_attr) - return -ENOMEM; + return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); err = ib_copy_from_udata(kern_flow_attr + 1, ucore, cmd.flow_attr.size); if (err) goto err_free_attr; } else { kern_flow_attr = &cmd.flow_attr; } uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); if (!uobj) { err = -ENOMEM; goto err_free_attr; } init_uobj(uobj, 0, file->ucontext, &rule_lock_class); down_write(&uobj->mutex); qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { err = -EINVAL; goto err_uobj; } - flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, - GFP_KERNEL); + flow_attr = kzalloc(sizeof(*flow_attr) + cmd.flow_attr.num_of_specs * + sizeof(union ib_flow_spec), GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; } flow_attr->type = kern_flow_attr->type; flow_attr->priority = 
kern_flow_attr->priority; flow_attr->num_of_specs = kern_flow_attr->num_of_specs; flow_attr->port = kern_flow_attr->port; flow_attr->flags = kern_flow_attr->flags; flow_attr->size = sizeof(*flow_attr); kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs && - cmd.flow_attr.size > - offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && cmd.flow_attr.size >= ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += - ((union ib_flow_spec *)ib_spec)->size; - cmd.flow_attr.size -= - ((struct ib_uverbs_flow_spec *)kern_spec)->size; - kern_spec += ((struct ib_uverbs_flow_spec *)kern_spec)->size; - ib_spec += ((union ib_flow_spec *)ib_spec)->size; + ((union ib_flow_spec *) ib_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec = (char *)kern_spec + ((struct ib_uverbs_flow_spec *) kern_spec)->size; + ib_spec = (char *)ib_spec + ((union ib_flow_spec *)ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", i, cmd.flow_attr.size); + err = -EINVAL; goto err_free; - } + } flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); goto err_free; } flow_id->qp = qp; flow_id->uobject = uobj; uobj->object = flow_id; err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); if (err) goto destroy_flow; memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); if (err) goto err_copy; put_qp_read(qp); mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->rule_list); mutex_unlock(&file->mutex); uobj->live = 1; up_write(&uobj->mutex); kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); return 0; err_copy: 
idr_remove_uobj(&ib_uverbs_rule_idr, uobj); destroy_flow: ib_destroy_flow(flow_id); err_free: kfree(flow_attr); err_put: put_qp_read(qp); err_uobj: put_uobj_write(uobj); err_free_attr: if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); return err; } int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_destroy_flow cmd; struct ib_flow *flow_id; struct ib_uobject *uobj; int ret; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (ret) return ret; + if (cmd.comp_mask) + return -EINVAL; + uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); if (!uobj) return -EINVAL; flow_id = uobj->object; ret = ib_destroy_flow(flow_id); if (!ret) uobj->live = 0; put_uobj_write(uobj); idr_remove_uobj(&ib_uverbs_rule_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); put_uobj(uobj); return ret; } -ssize_t ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +static int __uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_uverbs_create_xsrq *cmd, + struct ib_udata *udata) { - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_create_srq_resp resp; + struct ib_usrq_object *obj; + struct ib_pd *pd; + struct ib_srq *srq; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_srq_init_attr attr; + int ret; - return __uverbs_modify_qp(file, buf, in_len, out_len, - IB_USER_VERBS_CMD_EXTENDED); -} + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); + down_write(&obj->uevent.uobject.mutex); -ssize_t ib_uverbs_exp_create_cq(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct 
ib_udata *uhw) + if (cmd->srq_type == IB_SRQT_XRC) { + attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + + attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); + if (!attr.ext.xrc.cq) { + ret = -EINVAL; + goto err_put_xrcd; + } + } + + pd = idr_read_pd(cmd->pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_put_cq; + } + + attr.event_handler = ib_uverbs_srq_event_handler; + attr.srq_context = file; + attr.srq_type = cmd->srq_type; + attr.attr.max_wr = cmd->max_wr; + attr.attr.max_sge = cmd->max_sge; + attr.attr.srq_limit = cmd->srq_limit; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + + srq = pd->device->create_srq(pd, &attr, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put; + } + + srq->device = pd->device; + srq->pd = pd; + srq->srq_type = cmd->srq_type; + srq->uobject = &obj->uevent.uobject; + srq->event_handler = attr.event_handler; + srq->srq_context = attr.srq_context; + + if (cmd->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.cq = attr.ext.xrc.cq; + srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; + atomic_inc(&attr.ext.xrc.cq->usecnt); + atomic_inc(&attr.ext.xrc.xrcd->usecnt); + } + + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + + obj->uevent.uobject.object = srq; + ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + if (cmd->srq_type == IB_SRQT_XRC) + resp.srqn = srq->ext.xrc.srq_num; + + if (copy_to_user((void __user *) (unsigned long) cmd->response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (cmd->srq_type == IB_SRQT_XRC) { + put_uobj_read(xrcd_uobj); + 
put_cq_read(attr.ext.xrc.cq); + } + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); + + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_srq(srq); + +err_put: + put_pd_read(pd); + +err_put_cq: + if (cmd->srq_type == IB_SRQT_XRC) + put_cq_read(attr.ext.xrc.cq); + +err_put_xrcd: + if (cmd->srq_type == IB_SRQT_XRC) { + atomic_dec(&obj->uxrcd->refcnt); + put_uobj_read(xrcd_uobj); + } + +err: + put_uobj_write(&obj->uevent.uobject); + return ret; +} + +ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_create_cq_ex cmd; + struct ib_uverbs_create_srq cmd; + struct ib_uverbs_create_xsrq xcmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - return create_cq(file, buf, in_len, out_len, &cmd, - IB_USER_VERBS_CMD_EXTENDED, ucore->outbuf); + xcmd.response = cmd.response; + xcmd.user_handle = cmd.user_handle; + xcmd.srq_type = IB_SRQT_BASIC; + xcmd.pd_handle = cmd.pd_handle; + xcmd.max_wr = cmd.max_wr; + xcmd.max_sge = cmd.max_sge; + xcmd.srq_limit = cmd.srq_limit; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); + + ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); + if (ret) + return ret; + + return in_len; } -ssize_t ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file, - struct ib_udata *ucore, 
struct ib_udata *uhw) +ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, int out_len) { - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - struct ib_uverbs_modify_cq_ex cmd; - struct ib_cq *cq; - struct ib_cq_attr attr; + struct ib_uverbs_create_xsrq cmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); - if (!cq) + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); + + ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_modify_srq cmd; + struct ib_udata udata; + struct ib_srq *srq; + struct ib_srq_attr attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, + out_len); + + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) return -EINVAL; - attr.moderation.cq_count = cmd.cq_count; - attr.moderation.cq_period = cmd.cq_period; - attr.cq_cap_flags = cmd.cq_cap_flags; + attr.max_wr = cmd.max_wr; + attr.srq_limit = cmd.srq_limit; - ret = ib_modify_cq(cq, &attr, cmd.attr_mask); + ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); - put_cq_read(cq); + put_srq_read(srq); return ret ? 
ret : in_len; } - -ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) { - struct ib_uverbs_exp_query_device_resp resp; - struct ib_exp_device_attr exp_attr; - int ret; + struct ib_uverbs_query_srq cmd; + struct ib_uverbs_query_srq_resp resp; + struct ib_srq_attr attr; + struct ib_srq *srq; + int ret; - if (ucore->outlen + uhw->outlen < sizeof(resp)) + if (out_len < sizeof resp) return -ENOSPC; - memset(&resp, 0, sizeof(resp)); - memset(&exp_attr, 0, sizeof(exp_attr)); - ret = ib_exp_query_device(file->device->ib_dev, &exp_attr); - if (ret) - return ret; + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; - ib_uverbs_query_device_assign(&resp.base, &exp_attr.base, file); + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) + return -EINVAL; - resp.comp_mask = 0; - resp.device_cap_flags2 = 0; + ret = ib_query_srq(srq, &attr); - /* - * Handle regular attr fields - */ - if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) { - resp.timestamp_mask = exp_attr.base.timestamp_mask; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK; - } + put_srq_read(srq); - if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { - resp.hca_core_clock = exp_attr.base.hca_core_clock; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; - } + if (ret) + return ret; - /* - * Handle experimental attr fields - */ - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) { - resp.device_cap_flags2 = exp_attr.device_cap_flags2; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2; - } + memset(&resp, 0, sizeof resp); - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) { - resp.dc_rd_req = exp_attr.dc_rd_req; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD; - } + resp.max_wr = attr.max_wr; + resp.max_sge = attr.max_sge; + 
resp.srq_limit = attr.srq_limit; - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_RES_RD) { - resp.dc_rd_res = exp_attr.dc_rd_res; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD; - } + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) { - resp.inline_recv_sz = exp_attr.inline_recv_sz; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; - } + return in_len; +} - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) { - resp.max_rss_tbl_sz = exp_attr.max_rss_tbl_sz; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; - } +ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_srq cmd; + struct ib_uverbs_destroy_srq_resp resp; + struct ib_uobject *uobj; + struct ib_srq *srq; + struct ib_uevent_object *obj; + int ret = -EINVAL; + struct ib_usrq_object *us; + enum ib_srq_type srq_type; - if (copy_to_user(ucore->outbuf, &resp, sizeof(resp))) + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - return ucore->inlen + uhw->inlen; -} + uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); + if (!uobj) + return -EINVAL; + srq = uobj->object; + obj = container_of(uobj, struct ib_uevent_object, uobject); + srq_type = srq->srq_type; -ssize_t ib_uverbs_exp_create_qp(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) -{ - struct ib_uqp_object *obj; - struct ib_device *device; - struct ib_pd *pd = NULL; - struct ib_xrcd *xrcd = NULL; - struct ib_uobject *uninitialized_var(xrcd_uobj); - struct ib_cq *scq = NULL, *rcq = NULL; - struct ib_srq *srq = NULL; - struct ib_qp *qp; - struct ib_exp_qp_init_attr attr; - int ret; - struct ib_uverbs_exp_create_qp cmd_exp; - struct ib_uverbs_exp_create_qp_resp resp_exp; - struct ib_qp *parentqp = NULL; + ret = ib_destroy_srq(srq); + if (!ret) + 
uobj->live = 0; - memset(&cmd_exp, 0, sizeof(cmd_exp)); + put_uobj_write(uobj); - ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp)); if (ret) return ret; - if (!disable_raw_qp_enforcement && - cmd_exp.qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, - PRIV_NET_RAW)) - return -EPERM; + if (srq_type == IB_SRQT_XRC) { + us = container_of(obj, struct ib_usrq_object, uevent); + atomic_dec(&us->uxrcd->refcnt); + } - obj = kzalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - init_uobj(&obj->uevent.uobject, cmd_exp.user_handle, file->ucontext, - &qp_lock_class); - down_write(&obj->uevent.uobject.mutex); + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); - if (cmd_exp.qp_type == IB_QPT_XRC_TGT) { - xrcd = idr_read_xrcd(cmd_exp.pd_handle, file->ucontext, &xrcd_uobj); - if (!xrcd) { - ret = -EINVAL; - goto err_put; - } - device = xrcd->device; - } else { - if (cmd_exp.qp_type == IB_QPT_XRC_INI) { - cmd_exp.max_recv_wr = 0; - cmd_exp.max_recv_sge = 0; - } else { - if (cmd_exp.is_srq) { - srq = idr_read_srq(cmd_exp.srq_handle, file->ucontext); - if (!srq || srq->srq_type != IB_SRQT_BASIC) { - ret = -EINVAL; - goto err_put; - } - } + ib_uverbs_release_uevent(file, obj); - if (cmd_exp.recv_cq_handle != cmd_exp.send_cq_handle) { - rcq = idr_read_cq(cmd_exp.recv_cq_handle, file->ucontext, 0); - if (!rcq) { - ret = -EINVAL; - goto err_put; - } - } - } + memset(&resp, 0, sizeof resp); + resp.events_reported = obj->events_reported; - scq = idr_read_cq(cmd_exp.send_cq_handle, file->ucontext, !!rcq); - rcq = rcq ?: scq; - pd = idr_read_pd(cmd_exp.pd_handle, file->ucontext); - if (!pd || !scq) { - ret = -EINVAL; - goto err_put; - } + put_uobj(uobj); - device = pd->device; - } + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; - memset(&attr, 0, sizeof(attr)); - attr.event_handler = ib_uverbs_qp_event_handler; - attr.qp_context = 
file; - attr.send_cq = scq; - attr.recv_cq = rcq; - attr.srq = srq; - attr.xrcd = xrcd; - attr.sq_sig_type = cmd_exp.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; - attr.qp_type = cmd_exp.qp_type; - attr.create_flags = 0; + return ret ? ret : in_len; +} - attr.cap.max_send_wr = cmd_exp.max_send_wr; - attr.cap.max_recv_wr = cmd_exp.max_recv_wr; - attr.cap.max_send_sge = cmd_exp.max_send_sge; - attr.cap.max_recv_sge = cmd_exp.max_recv_sge; - attr.cap.max_inline_data = cmd_exp.max_inline_data; +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp = { {0} }; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr = {0}; + int err; - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS) - attr.create_flags |= cmd_exp.qp_cap_flags & - (IB_QP_CREATE_CROSS_CHANNEL | - IB_QP_CREATE_MANAGED_SEND | - IB_QP_CREATE_MANAGED_RECV); + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) { - struct ib_uverbs_qpg *qpg; - if (cmd_exp.qp_type != IB_QPT_RAW_PACKET && - cmd_exp.qp_type != IB_QPT_UD) { - ret = -EINVAL; - goto err_put; - } - qpg = &cmd_exp.qpg; - switch (qpg->qpg_type) { - case IB_QPG_PARENT: - attr.parent_attrib.rss_child_count = - qpg->parent_attrib.rss_child_count; - attr.parent_attrib.tss_child_count = - qpg->parent_attrib.tss_child_count; - break; - case IB_QPG_CHILD_RX: - case IB_QPG_CHILD_TX: - parentqp = idr_read_qp(qpg->parent_handle, - file->ucontext); - if (!parentqp) { - ret = -EINVAL; - goto err_put; - } - attr.qpg_parent = parentqp; - break; - default: - ret = -EINVAL; - goto err_put; - } - attr.qpg_type = qpg->qpg_type; - } + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) - attr.max_inl_recv = cmd_exp.max_inl_recv; + if (cmd.comp_mask) + return -EINVAL; - 
obj->uevent.events_reported = 0; - INIT_LIST_HEAD(&obj->uevent.event_list); - INIT_LIST_HEAD(&obj->mcast_list); + if (cmd.reserved) + return -EINVAL; - if (cmd_exp.qp_type == IB_QPT_XRC_TGT) - qp = ib_create_qp(pd, (struct ib_qp_init_attr *)&attr); - else - qp = device->exp_create_qp(pd, &attr, uhw); + resp.response_length = offsetof(typeof(resp), odp_caps); - if (IS_ERR(qp)) { - ret = PTR_ERR(qp); - goto err_put; - } + if (ucore->outlen < resp.response_length) + return -ENOSPC; - if (cmd_exp.qp_type != IB_QPT_XRC_TGT) { - qp->real_qp = qp; - qp->device = device; - qp->pd = pd; - qp->send_cq = attr.send_cq; - qp->recv_cq = attr.recv_cq; - qp->srq = attr.srq; - qp->event_handler = attr.event_handler; - qp->qp_context = attr.qp_context; - qp->qp_type = attr.qp_type; - atomic_set(&qp->usecnt, 0); - atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); - if (attr.recv_cq) - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); - } - qp->uobject = &obj->uevent.uobject; + err = ib_dev->query_device(ib_dev, &attr, uhw); + if (err) + return err; - obj->uevent.uobject.object = qp; - ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); - if (ret) - goto err_destroy; + copy_query_dev_fields(file, ib_dev, &resp.base, &attr); - memset(&resp_exp, 0, sizeof(resp_exp)); - resp_exp.qpn = qp->qp_num; - resp_exp.qp_handle = obj->uevent.uobject.id; - resp_exp.max_recv_sge = attr.cap.max_recv_sge; - resp_exp.max_send_sge = attr.cap.max_send_sge; - resp_exp.max_recv_wr = attr.cap.max_recv_wr; - resp_exp.max_send_wr = attr.cap.max_send_wr; - resp_exp.max_inline_data = attr.cap.max_inline_data; + if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) + goto end; - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) { - resp_exp.comp_mask |= IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV; - resp_exp.max_inl_recv = attr.max_inl_recv; - } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + resp.odp_caps.general_caps = attr.odp_caps.general_caps; 
+ resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; +#endif + resp.response_length += sizeof(resp.odp_caps); - ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp)); - if (ret) - goto err_copy; + if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask)) + goto end; - if (xrcd) { - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); - atomic_inc(&obj->uxrcd->refcnt); - put_xrcd_read(xrcd_uobj); - } + resp.timestamp_mask = attr.timestamp_mask; + resp.response_length += sizeof(resp.timestamp_mask); - if (pd) - put_pd_read(pd); - if (scq) - put_cq_read(scq); - if (rcq && rcq != scq) - put_cq_read(rcq); - if (srq) - put_srq_read(srq); - if (parentqp) - put_qp_read(parentqp); + if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock)) + goto end; - mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); - mutex_unlock(&file->mutex); + resp.hca_core_clock = attr.hca_core_clock; + resp.response_length += sizeof(resp.hca_core_clock); - obj->uevent.uobject.live = 1; + if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex)) + goto end; - up_write(&obj->uevent.uobject.mutex); + resp.device_cap_flags_ex = attr.device_cap_flags; + resp.response_length += sizeof(resp.device_cap_flags_ex); - return ucore->inlen + uhw->inlen; + if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps)) + goto end; -err_copy: - idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts; + resp.rss_caps.max_rwq_indirection_tables = + attr.rss_caps.max_rwq_indirection_tables; + resp.rss_caps.max_rwq_indirection_table_size = + attr.rss_caps.max_rwq_indirection_table_size; -err_destroy: - 
ib_destroy_qp(qp); + resp.response_length += sizeof(resp.rss_caps); -err_put: - if (xrcd) - put_xrcd_read(xrcd_uobj); - if (pd) - put_pd_read(pd); - if (scq) - put_cq_read(scq); - if (rcq && rcq != scq) - put_cq_read(rcq); - if (srq) - put_srq_read(srq); - if (parentqp) - put_qp_read(parentqp); + if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq)) + goto end; - put_uobj_write(&obj->uevent.uobject); - return ret; + resp.max_wq_type_rq = attr.max_wq_type_rq; + resp.response_length += sizeof(resp.max_wq_type_rq); +end: + err = ib_copy_to_udata(ucore, &resp, resp.response_length); + return err; } - -int ib_exp_query_device(struct ib_device *device, - struct ib_exp_device_attr *device_attr) -{ - return device->exp_query_device(device, device_attr); -} -EXPORT_SYMBOL(ib_exp_query_device); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_main.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_main.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_main.c (revision 319974) @@ -1,1435 +1,1392 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include -#include -#include -#include #include +#include + #include "uverbs.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace verbs access"); MODULE_LICENSE("Dual BSD/GPL"); enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, IB_UVERBS_MAX_DEVICES = 32 }; #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) -static int uverbs_copy_from_udata_ex(void *dest, struct ib_udata *udata, size_t len) -{ - return copy_from_user(dest, udata->inbuf, min(udata->inlen, len)) ? -EFAULT : 0; -} - -static int uverbs_copy_to_udata_ex(struct ib_udata *udata, void *src, size_t len) -{ - return copy_to_user(udata->outbuf, src, min(udata->outlen, len)) ? 
-EFAULT : 0; -} - -static struct ib_udata_ops uverbs_copy_ex = { - .copy_from = uverbs_copy_from_udata_ex, - .copy_to = uverbs_copy_to_udata_ex -}; - -#define INIT_UDATA_EX(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->ops = &uverbs_copy_ex; \ - (udata)->inbuf = (void __user *)(unsigned long)(ibuf); \ - (udata)->outbuf = (void __user *)(unsigned long)(obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) - - static struct class *uverbs_class; DEFINE_SPINLOCK(ib_uverbs_idr_lock); DEFINE_IDR(ib_uverbs_pd_idr); DEFINE_IDR(ib_uverbs_mr_idr); DEFINE_IDR(ib_uverbs_mw_idr); DEFINE_IDR(ib_uverbs_ah_idr); DEFINE_IDR(ib_uverbs_cq_idr); DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); DEFINE_IDR(ib_uverbs_xrcd_idr); DEFINE_IDR(ib_uverbs_rule_idr); -DEFINE_IDR(ib_uverbs_dct_idr); +DEFINE_IDR(ib_uverbs_wq_idr); +DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr); static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) = { - [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, - [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, - [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, - [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, - [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, - [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, - [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, + [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, + [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, + [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, + [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, + [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr, + [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, [IB_USER_VERBS_CMD_ALLOC_MW] = 
ib_uverbs_alloc_mw, [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, - [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, - [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, - [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, - [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, - [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, - [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, - [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, - [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, - [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, - [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, - [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, - [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, - [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, - [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, - [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, - [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, - [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, - [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, - [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, - [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, + [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, + [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, + [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, + [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, + [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, + [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, + [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, + [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, + [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, + [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, + [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, + [IB_USER_VERBS_CMD_CREATE_AH] = 
ib_uverbs_create_ah, + [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, + [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, + [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, + [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, + [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, + [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, + [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, }; static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, + [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device, + [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq, + [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp, + [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq, + [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq, + [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq, + [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table, + [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table, }; -static ssize_t (*uverbs_exp_cmd_table[])(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) = { - [IB_USER_VERBS_EXP_CMD_CREATE_QP] = ib_uverbs_exp_create_qp, - [IB_USER_VERBS_EXP_CMD_MODIFY_CQ] = ib_uverbs_exp_modify_cq, - [IB_USER_VERBS_EXP_CMD_MODIFY_QP] = ib_uverbs_exp_modify_qp, - [IB_USER_VERBS_EXP_CMD_CREATE_CQ] = ib_uverbs_exp_create_cq, - [IB_USER_VERBS_EXP_CMD_QUERY_DEVICE] = ib_uverbs_exp_query_device, - [IB_USER_VERBS_EXP_CMD_CREATE_DCT] = ib_uverbs_exp_create_dct, - 
[IB_USER_VERBS_EXP_CMD_DESTROY_DCT] = ib_uverbs_exp_destroy_dct, - [IB_USER_VERBS_EXP_CMD_QUERY_DCT] = ib_uverbs_exp_query_dct, -}; - static void ib_uverbs_add_one(struct ib_device *device); -static void ib_uverbs_remove_one(struct ib_device *device); +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); -static void ib_uverbs_release_dev(struct kref *ref) +int uverbs_dealloc_mw(struct ib_mw *mw) { + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + return ret; +} + +static void ib_uverbs_release_dev(struct kobject *kobj) +{ struct ib_uverbs_device *dev = - container_of(ref, struct ib_uverbs_device, ref); + container_of(kobj, struct ib_uverbs_device, kobj); - complete(&dev->comp); + cleanup_srcu_struct(&dev->disassociate_srcu); + kfree(dev); } +static struct kobj_type ib_uverbs_dev_ktype = { + .release = ib_uverbs_release_dev, +}; + static void ib_uverbs_release_event_file(struct kref *ref) { struct ib_uverbs_event_file *file = container_of(ref, struct ib_uverbs_event_file, ref); kfree(file); } void ib_uverbs_release_ucq(struct ib_uverbs_file *file, struct ib_uverbs_event_file *ev_file, struct ib_ucq_object *uobj) { struct ib_uverbs_event *evt, *tmp; if (ev_file) { spin_lock_irq(&ev_file->lock); list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) { list_del(&evt->list); kfree(evt); } spin_unlock_irq(&ev_file->lock); kref_put(&ev_file->ref, ib_uverbs_release_event_file); } spin_lock_irq(&file->async_file->lock); list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) { list_del(&evt->list); kfree(evt); } spin_unlock_irq(&file->async_file->lock); } void ib_uverbs_release_uevent(struct ib_uverbs_file *file, struct ib_uevent_object *uobj) { struct ib_uverbs_event *evt, *tmp; spin_lock_irq(&file->async_file->lock); list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) { list_del(&evt->list); kfree(evt); } spin_unlock_irq(&file->async_file->lock); } 
static void ib_uverbs_detach_umcast(struct ib_qp *qp, struct ib_uqp_object *uobj) { struct ib_uverbs_mcast_entry *mcast, *tmp; list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) { ib_detach_mcast(qp, &mcast->gid, mcast->lid); list_del(&mcast->list); kfree(mcast); } } static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, struct ib_ucontext *context) { struct ib_uobject *uobj, *tmp; - int err; - if (!context) - return 0; - context->closing = 1; list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) { struct ib_ah *ah = uobj->object; idr_remove_uobj(&ib_uverbs_ah_idr, uobj); ib_destroy_ah(ah); kfree(uobj); } /* Remove MWs before QPs, in order to support type 2A MWs. */ list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) { struct ib_mw *mw = uobj->object; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - err = ib_dealloc_mw(mw); - if (err) { - pr_info("user_verbs: couldn't deallocate MW during cleanup.\n"); - pr_info("user_verbs: the system may have become unstable.\n"); - } + uverbs_dealloc_mw(mw); kfree(uobj); } + list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) { struct ib_flow *flow_id = uobj->object; idr_remove_uobj(&ib_uverbs_rule_idr, uobj); ib_destroy_flow(flow_id); kfree(uobj); } list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) { struct ib_qp *qp = uobj->object; struct ib_uqp_object *uqp = container_of(uobj, struct ib_uqp_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - - ib_uverbs_detach_umcast(qp, uqp); - err = ib_destroy_qp(qp); - if (err) - pr_info("destroying uverbs qp failed: err %d\n", err); - + if (qp == qp->real_qp) + ib_uverbs_detach_umcast(qp, uqp); + ib_destroy_qp(qp); ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } - list_for_each_entry_safe(uobj, tmp, &context->dct_list, list) { - struct ib_dct *dct = uobj->object; - struct ib_udct_object *udct = - container_of(uobj, struct ib_udct_object, uobject); + list_for_each_entry_safe(uobj, tmp, 
&context->rwq_ind_tbl_list, list) { + struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; - idr_remove_uobj(&ib_uverbs_dct_idr, uobj); + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + ib_destroy_rwq_ind_table(rwq_ind_tbl); + kfree(ind_tbl); + kfree(uobj); + } - err = ib_destroy_dct(dct); - if (err) - pr_info("destroying uverbs dct failed: err %d\n", err); + list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) { + struct ib_wq *wq = uobj->object; + struct ib_uwq_object *uwq = + container_of(uobj, struct ib_uwq_object, uevent.uobject); - kfree(udct); + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); + ib_destroy_wq(wq); + ib_uverbs_release_uevent(file, &uwq->uevent); + kfree(uwq); } list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { struct ib_srq *srq = uobj->object; struct ib_uevent_object *uevent = container_of(uobj, struct ib_uevent_object, uobject); idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - err = ib_destroy_srq(srq); - if (err) - pr_info("destroying uverbs srq failed: err %d\n", err); + ib_destroy_srq(srq); ib_uverbs_release_uevent(file, uevent); kfree(uevent); } list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) { struct ib_cq *cq = uobj->object; struct ib_uverbs_event_file *ev_file = cq->cq_context; struct ib_ucq_object *ucq = container_of(uobj, struct ib_ucq_object, uobject); idr_remove_uobj(&ib_uverbs_cq_idr, uobj); - err = ib_destroy_cq(cq); - if (err) - pr_info("destroying uverbs cq failed: err %d\n", err); - + ib_destroy_cq(cq); ib_uverbs_release_ucq(file, ev_file, ucq); kfree(ucq); } list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { struct ib_mr *mr = uobj->object; idr_remove_uobj(&ib_uverbs_mr_idr, uobj); - err = ib_dereg_mr(mr); - if (err) { - pr_info("user_verbs: couldn't deregister an MR during cleanup.\n"); - pr_info("user_verbs: the system may have become unstable.\n"); - } + ib_dereg_mr(mr); kfree(uobj); } mutex_lock(&file->device->xrcd_tree_mutex); 
list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) { struct ib_xrcd *xrcd = uobj->object; struct ib_uxrcd_object *uxrcd = container_of(uobj, struct ib_uxrcd_object, uobject); idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); ib_uverbs_dealloc_xrcd(file->device, xrcd); kfree(uxrcd); } mutex_unlock(&file->device->xrcd_tree_mutex); list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) { struct ib_pd *pd = uobj->object; idr_remove_uobj(&ib_uverbs_pd_idr, uobj); ib_dealloc_pd(pd); kfree(uobj); } + put_pid(context->tgid); + return context->device->dealloc_ucontext(context); } +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ + complete(&dev->comp); +} + static void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); + struct ib_device *ib_dev; + int srcu_key; - module_put(file->device->ib_dev->owner); - kref_put(&file->device->ref, ib_uverbs_release_dev); + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (ib_dev && !ib_dev->disassociate_ucontext) + module_put(ib_dev->owner); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + if (atomic_dec_and_test(&file->device->refcount)) + ib_uverbs_comp_dev(file->device); + kfree(file); } static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_event_file *file = filp->private_data; struct ib_uverbs_event *event; int eventsz; int ret = 0; spin_lock_irq(&file->lock); while (list_empty(&file->event_list)) { spin_unlock_irq(&file->lock); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->poll_wait, - !list_empty(&file->event_list))) + (!list_empty(&file->event_list) || + /* The barriers built into wait_event_interruptible() + * and wake_up() guarantee this will see the null set + * without using RCU + */ +
!file->uverbs_file->device->ib_dev))) return -ERESTARTSYS; + /* If device was disassociated and no event exists set an error */ + if (list_empty(&file->event_list) && + !file->uverbs_file->device->ib_dev) + return -EIO; + spin_lock_irq(&file->lock); } event = list_entry(file->event_list.next, struct ib_uverbs_event, list); if (file->is_async) eventsz = sizeof (struct ib_uverbs_async_event_desc); else eventsz = sizeof (struct ib_uverbs_comp_event_desc); if (eventsz > count) { ret = -EINVAL; event = NULL; } else { list_del(file->event_list.next); if (event->counter) { ++(*event->counter); list_del(&event->obj_list); } } spin_unlock_irq(&file->lock); if (event) { if (copy_to_user(buf, event, eventsz)) ret = -EFAULT; else ret = eventsz; } kfree(event); return ret; } static unsigned int ib_uverbs_event_poll(struct file *filp, struct poll_table_struct *wait) { unsigned int pollflags = 0; struct ib_uverbs_event_file *file = filp->private_data; - file->filp = filp; poll_wait(filp, &file->poll_wait, wait); spin_lock_irq(&file->lock); if (!list_empty(&file->event_list)) pollflags = POLLIN | POLLRDNORM; spin_unlock_irq(&file->lock); return pollflags; } static int ib_uverbs_event_fasync(int fd, struct file *filp, int on) { struct ib_uverbs_event_file *file = filp->private_data; return fasync_helper(fd, filp, on, &file->async_queue); } static int ib_uverbs_event_close(struct inode *inode, struct file *filp) { struct ib_uverbs_event_file *file = filp->private_data; struct ib_uverbs_event *entry, *tmp; + int closed_already = 0; + mutex_lock(&file->uverbs_file->device->lists_mutex); spin_lock_irq(&file->lock); + closed_already = file->is_closed; file->is_closed = 1; list_for_each_entry_safe(entry, tmp, &file->event_list, list) { if (entry->counter) list_del(&entry->obj_list); kfree(entry); } spin_unlock_irq(&file->lock); - - if (file->is_async) { - ib_unregister_event_handler(&file->uverbs_file->event_handler); - kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); + if 
(!closed_already) { + list_del(&file->list); + if (file->is_async) + ib_unregister_event_handler(&file->uverbs_file-> + event_handler); } + mutex_unlock(&file->uverbs_file->device->lists_mutex); + + kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&file->ref, ib_uverbs_release_event_file); return 0; } static const struct file_operations uverbs_event_fops = { .owner = THIS_MODULE, - .read = ib_uverbs_event_read, + .read = ib_uverbs_event_read, .poll = ib_uverbs_event_poll, .release = ib_uverbs_event_close, .fasync = ib_uverbs_event_fasync, .llseek = no_llseek, }; void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) { struct ib_uverbs_event_file *file = cq_context; struct ib_ucq_object *uobj; struct ib_uverbs_event *entry; unsigned long flags; if (!file) return; spin_lock_irqsave(&file->lock, flags); if (file->is_closed) { spin_unlock_irqrestore(&file->lock, flags); return; } entry = kmalloc(sizeof *entry, GFP_ATOMIC); if (!entry) { spin_unlock_irqrestore(&file->lock, flags); return; } uobj = container_of(cq->uobject, struct ib_ucq_object, uobject); entry->desc.comp.cq_handle = cq->uobject->user_handle; entry->counter = &uobj->comp_events_reported; list_add_tail(&entry->list, &file->event_list); list_add_tail(&entry->obj_list, &uobj->comp_list); spin_unlock_irqrestore(&file->lock, flags); wake_up_interruptible(&file->poll_wait); - if (file->filp) - selwakeup(&file->filp->f_selinfo); + linux_poll_wakeup(file->filp); kill_fasync(&file->async_queue, SIGIO, POLL_IN); } static void ib_uverbs_async_handler(struct ib_uverbs_file *file, __u64 element, __u64 event, struct list_head *obj_list, u32 *counter) { struct ib_uverbs_event *entry; unsigned long flags; spin_lock_irqsave(&file->async_file->lock, flags); if (file->async_file->is_closed) { spin_unlock_irqrestore(&file->async_file->lock, flags); return; } entry = kmalloc(sizeof *entry, GFP_ATOMIC); if (!entry) { spin_unlock_irqrestore(&file->async_file->lock, flags); return; } 
entry->desc.async.element = element; entry->desc.async.event_type = event; + entry->desc.async.reserved = 0; entry->counter = counter; list_add_tail(&entry->list, &file->async_file->event_list); if (obj_list) list_add_tail(&entry->obj_list, obj_list); spin_unlock_irqrestore(&file->async_file->lock, flags); wake_up_interruptible(&file->async_file->poll_wait); - if (file->async_file->filp) - selwakeup(&file->async_file->filp->f_selinfo); + linux_poll_wakeup(file->async_file->filp); kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN); } void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) { struct ib_ucq_object *uobj = container_of(event->element.cq->uobject, struct ib_ucq_object, uobject); ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle, event->event, &uobj->async_list, &uobj->async_events_reported); } void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) { struct ib_uevent_object *uobj; + /* for XRC target qp's, check that qp is live */ + if (!event->element.qp->uobject || !event->element.qp->uobject->live) + return; + uobj = container_of(event->element.qp->uobject, struct ib_uevent_object, uobject); ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, event->event, &uobj->event_list, &uobj->events_reported); } +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj = container_of(event->element.wq->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) { struct ib_uevent_object *uobj; uobj = container_of(event->element.srq->uobject, struct ib_uevent_object, uobject); ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, event->event, &uobj->event_list, &uobj->events_reported); } void ib_uverbs_event_handler(struct 
ib_event_handler *handler, struct ib_event *event) { struct ib_uverbs_file *file = container_of(handler, struct ib_uverbs_file, event_handler); ib_uverbs_async_handler(file, event->element.port_num, event->event, NULL, NULL); } +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file) +{ + kref_put(&file->async_file->ref, ib_uverbs_release_event_file); + file->async_file = NULL; +} + struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev, int is_async) { struct ib_uverbs_event_file *ev_file; struct file *filp; + int ret; - ev_file = kzalloc(sizeof *ev_file, GFP_KERNEL); + ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) return ERR_PTR(-ENOMEM); kref_init(&ev_file->ref); spin_lock_init(&ev_file->lock); INIT_LIST_HEAD(&ev_file->event_list); init_waitqueue_head(&ev_file->poll_wait); ev_file->uverbs_file = uverbs_file; - ev_file->is_async = is_async; + kref_get(&ev_file->uverbs_file->ref); + ev_file->async_queue = NULL; + ev_file->is_closed = 0; /* * fops_get() can't fail here, because we're coming from a * system call on a uverbs file, which will already have a * module reference. 
*/ filp = alloc_file(FMODE_READ, fops_get(&uverbs_event_fops)); - - if (IS_ERR(filp)) { - kfree(ev_file); - } else { + if (IS_ERR(filp)) + goto err_put_refs; filp->private_data = ev_file; + ev_file->filp = filp; + + mutex_lock(&uverbs_file->device->lists_mutex); + list_add_tail(&ev_file->list, + &uverbs_file->device->uverbs_events_file_list); + mutex_unlock(&uverbs_file->device->lists_mutex); + + if (is_async) { + WARN_ON(uverbs_file->async_file); + uverbs_file->async_file = ev_file; + kref_get(&uverbs_file->async_file->ref); + INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, + ib_dev, + ib_uverbs_event_handler); + ret = ib_register_event_handler(&uverbs_file->event_handler); + if (ret) + goto err_put_file; + + /* At that point async file stuff was fully set */ + ev_file->is_async = 1; } return filp; + +err_put_file: + fput(filp); + kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file); + uverbs_file->async_file = NULL; + return ERR_PTR(ret); + +err_put_refs: + kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); + kref_put(&ev_file->ref, ib_uverbs_release_event_file); + return filp; } /* * Look up a completion event file by FD. If lookup is successful, * takes a ref to the event file struct that it returns; if * unsuccessful, returns NULL. 
*/ struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd) { struct ib_uverbs_event_file *ev_file = NULL; struct fd f = fdget(fd); if (!f.file) return NULL; if (f.file->f_op != &uverbs_event_fops) goto out; ev_file = f.file->private_data; if (ev_file->is_async) { ev_file = NULL; goto out; } kref_get(&ev_file->ref); out: fdput(f); return ev_file; } -static const char *verbs_cmd_str(__u32 cmd) +static int verify_command_mask(struct ib_device *ib_dev, __u32 command) { - switch (cmd) { - case IB_USER_VERBS_CMD_GET_CONTEXT: - return "GET_CONTEXT"; - case IB_USER_VERBS_CMD_QUERY_DEVICE: - return "QUERY_DEVICE"; - case IB_USER_VERBS_CMD_QUERY_PORT: - return "QUERY_PORT"; - case IB_USER_VERBS_CMD_ALLOC_PD: - return "ALLOC_PD"; - case IB_USER_VERBS_CMD_DEALLOC_PD: - return "DEALLOC_PD"; - case IB_USER_VERBS_CMD_REG_MR: - return "REG_MR"; - case IB_USER_VERBS_CMD_DEREG_MR: - return "DEREG_MR"; - case IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL: - return "CREATE_COMP_CHANNEL"; - case IB_USER_VERBS_CMD_CREATE_CQ: - return "CREATE_CQ"; - case IB_USER_VERBS_CMD_RESIZE_CQ: - return "RESIZE_CQ"; - case IB_USER_VERBS_CMD_POLL_CQ: - return "POLL_CQ"; - case IB_USER_VERBS_CMD_REQ_NOTIFY_CQ: - return "REQ_NOTIFY_CQ"; - case IB_USER_VERBS_CMD_DESTROY_CQ: - return "DESTROY_CQ"; - case IB_USER_VERBS_CMD_CREATE_QP: - return "CREATE_QP"; - case IB_USER_VERBS_CMD_QUERY_QP: - return "QUERY_QP"; - case IB_USER_VERBS_CMD_MODIFY_QP: - return "MODIFY_QP"; - case IB_USER_VERBS_CMD_DESTROY_QP: - return "DESTROY_QP"; - case IB_USER_VERBS_CMD_POST_SEND: - return "POST_SEND"; - case IB_USER_VERBS_CMD_POST_RECV: - return "POST_RECV"; - case IB_USER_VERBS_CMD_POST_SRQ_RECV: - return "POST_SRQ_RECV"; - case IB_USER_VERBS_CMD_CREATE_AH: - return "CREATE_AH"; - case IB_USER_VERBS_CMD_DESTROY_AH: - return "DESTROY_AH"; - case IB_USER_VERBS_CMD_ATTACH_MCAST: - return "ATTACH_MCAST"; - case IB_USER_VERBS_CMD_DETACH_MCAST: - return "DETACH_MCAST"; - case IB_USER_VERBS_CMD_CREATE_SRQ: - return "CREATE_SRQ"; 
- case IB_USER_VERBS_CMD_MODIFY_SRQ: - return "MODIFY_SRQ"; - case IB_USER_VERBS_CMD_QUERY_SRQ: - return "QUERY_SRQ"; - case IB_USER_VERBS_CMD_DESTROY_SRQ: - return "DESTROY_SRQ"; - case IB_USER_VERBS_CMD_OPEN_XRCD: - return "OPEN_XRCD"; - case IB_USER_VERBS_CMD_CLOSE_XRCD: - return "CLOSE_XRCD"; - case IB_USER_VERBS_CMD_CREATE_XSRQ: - return "CREATE_XSRQ"; - case IB_USER_VERBS_CMD_OPEN_QP: - return "OPEN_QP"; - } + u64 mask; - return "Unknown command"; -} + if (command <= IB_USER_VERBS_CMD_OPEN_QP) + mask = ib_dev->uverbs_cmd_mask; + else + mask = ib_dev->uverbs_ex_cmd_mask; -enum { - COMMAND_INFO_MASK = 0x1000, -}; + if (mask & ((u64)1 << command)) + return 0; -static ssize_t ib_uverbs_exp_handle_cmd(struct ib_uverbs_file *file, - const char __user *buf, - struct ib_device *dev, - struct ib_uverbs_cmd_hdr *hdr, - size_t count, - int legacy_ex_cmd) -{ - struct ib_udata ucore; - struct ib_udata uhw; - struct ib_uverbs_ex_cmd_hdr ex_hdr; - __u32 command = hdr->command - IB_USER_VERBS_EXP_CMD_FIRST; - - if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; - - if (command >= ARRAY_SIZE(uverbs_exp_cmd_table) || - !uverbs_exp_cmd_table[command]) - return -EINVAL; - - if (!file->ucontext) - return -EINVAL; - - if (!(dev->uverbs_exp_cmd_mask & (1ull << command))) - return -ENOSYS; - - if (legacy_ex_cmd) { - struct ib_uverbs_ex_cmd_hdr_legacy hxl; - struct ib_uverbs_ex_cmd_resp1_legacy resp1; - __u64 response; - ssize_t ret; - - if (count < sizeof(hxl)) - return -EINVAL; - - if (copy_from_user(&hxl, buf, sizeof(hxl))) - return -EFAULT; - - if (((hxl.in_words + hxl.provider_in_words) * 4) != count) - return -EINVAL; - - count -= sizeof(hxl); - buf += sizeof(hxl); - if (hxl.out_words || hxl.provider_out_words) { - if (count < sizeof(resp1)) - return -EINVAL; - if (copy_from_user(&resp1, buf, sizeof(resp1))) - return -EFAULT; - response = resp1.response; - if (!response) - return -EINVAL; - - /* - * Change user buffer 
to comply with new extension format. - */ - if (sizeof(resp1.comp_mask) != sizeof(resp1.response)) - return -EFAULT; - buf += sizeof(resp1.comp_mask); - if (copy_to_user(__DECONST(void __user *, buf), &resp1.comp_mask, - sizeof(resp1.response))) - return -EFAULT; - - } else { - response = 0; - } - - INIT_UDATA_EX(&ucore, - (hxl.in_words) ? buf : 0, - response, - hxl.in_words * 4, - hxl.out_words * 4); - - INIT_UDATA_EX(&uhw, - (hxl.provider_in_words) ? buf + ucore.inlen : 0, - (hxl.provider_out_words) ? response + ucore.outlen : 0, - hxl.provider_in_words * 4, - hxl.provider_out_words * 4); - - ret = uverbs_exp_cmd_table[command](file, &ucore, &uhw); - /* - * UnChange user buffer - */ - if (response && copy_to_user(__DECONST(void __user *, buf), &resp1.response, sizeof(resp1.response))) - return -EFAULT; - - return ret; - } else { - if (count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; - - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) - return -EFAULT; - - buf += sizeof(hdr) + sizeof(ex_hdr); - - if ((hdr->in_words + ex_hdr.provider_in_words) * 8 != count) - return -EINVAL; - - if (ex_hdr.response) { - if (!hdr->out_words && !ex_hdr.provider_out_words) - return -EINVAL; - } else { - if (hdr->out_words || ex_hdr.provider_out_words) - return -EINVAL; - } - - INIT_UDATA_EX(&ucore, - (hdr->in_words) ? buf : 0, - (unsigned long)ex_hdr.response, - hdr->in_words * 8, - hdr->out_words * 8); - - INIT_UDATA_EX(&uhw, - (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, - (ex_hdr.provider_out_words) ? 
ex_hdr.response + ucore.outlen : 0, - ex_hdr.provider_in_words * 8, - ex_hdr.provider_out_words * 8); - - return uverbs_exp_cmd_table[command](file, &ucore, &uhw); - } + return -1; } static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; - struct ib_device *dev = file->device->ib_dev; + struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; - struct timespec ts1; - struct timespec ts2; - ktime_t t1, t2, delta; - s64 ds; - ssize_t ret; - u64 dividend; - u32 divisor; - __u32 flags; __u32 command; - int legacy_ex_cmd = 0; - size_t written_count = count; + __u32 flags; + int srcu_key; + ssize_t ret; + if (WARN_ON_ONCE(!ib_safe_file_access(filp))) + return -EACCES; + if (count < sizeof hdr) return -EINVAL; if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - /* - * For BWD compatibility change old style extension verbs commands - * to their equivalent experimental command. - */ - if ((hdr.command >= IB_USER_VERBS_LEGACY_CMD_FIRST) && - (hdr.command <= IB_USER_VERBS_LEGACY_EX_CMD_LAST)) { - hdr.command += IB_USER_VERBS_EXP_CMD_FIRST - - IB_USER_VERBS_LEGACY_CMD_FIRST; - legacy_ex_cmd = 1; + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; } + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) { + ret = -EINVAL; + goto out; + } + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + if (verify_command_mask(ib_dev, command)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) { + ret = -EINVAL; + goto out; + } + flags = (hdr.command & IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - ktime_get_ts(&ts1); - if (!flags && (command >= 
IB_USER_VERBS_EXP_CMD_FIRST)) { - ret = ib_uverbs_exp_handle_cmd(file, buf, dev, &hdr, count, legacy_ex_cmd); - } else if (!flags) { + if (!flags) { if (command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[command]) - return -EINVAL; + !uverbs_cmd_table[command]) { + ret = -EINVAL; + goto out; + } - if (!file->ucontext && - command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + if (hdr.in_words * 4 != count) { + ret = -EINVAL; + goto out; + } - if (!(dev->uverbs_cmd_mask & (1ull << command))) - return -ENOSYS; + ret = uverbs_cmd_table[command](file, ib_dev, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); - if (hdr.in_words * 4 != count) - return -EINVAL; - - ret = uverbs_cmd_table[command](file, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; - struct ib_uverbs_ex_cmd_hdr ex_hdr; + size_t written_count = count; - if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; - if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || - !uverbs_ex_cmd_table[command]) - return -EINVAL; + !uverbs_ex_cmd_table[command]) { + ret = -ENOSYS; + goto out; + } - if (!file->ucontext) - return -EINVAL; + if (!file->ucontext) { + ret = -EINVAL; + goto out; + } - if (!(dev->uverbs_ex_cmd_mask & (1ull << command))) - return -ENOSYS; + if (count < (sizeof(hdr) + sizeof(ex_hdr))) { + ret = -EINVAL; + goto out; + } - if (count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) { + ret = -EFAULT; + goto out; + } - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) - return -EFAULT; - count -= sizeof(hdr) + sizeof(ex_hdr); buf += sizeof(hdr) + sizeof(ex_hdr); - if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) - return -EINVAL; + if ((hdr.in_words + ex_hdr.provider_in_words) * 
8 != count) { + ret = -EINVAL; + goto out; + } + if (ex_hdr.cmd_hdr_reserved) { + ret = -EINVAL; + goto out; + } + if (ex_hdr.response) { - if (!hdr.out_words && !ex_hdr.provider_out_words) - return -EINVAL; + if (!hdr.out_words && !ex_hdr.provider_out_words) { + ret = -EINVAL; + goto out; + } + + if (!access_ok(VERIFY_WRITE, + (void __user *) (unsigned long) ex_hdr.response, + (hdr.out_words + ex_hdr.provider_out_words) * 8)) { + ret = -EFAULT; + goto out; + } } else { - if (hdr.out_words || ex_hdr.provider_out_words) - return -EINVAL; + if (hdr.out_words || ex_hdr.provider_out_words) { + ret = -EINVAL; + goto out; + } } - INIT_UDATA_EX(&ucore, - (hdr.in_words) ? buf : 0, - (unsigned long)ex_hdr.response, - hdr.in_words * 8, - hdr.out_words * 8); + INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, + hdr.in_words * 8, hdr.out_words * 8); - INIT_UDATA_EX(&uhw, - (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, - (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0, - ex_hdr.provider_in_words * 8, - ex_hdr.provider_out_words * 8); + INIT_UDATA_BUF_OR_NULL(&uhw, + buf + ucore.inlen, + (unsigned long) ex_hdr.response + ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); - ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw); - - if (ret) - return ret; - - return written_count; - + ret = uverbs_ex_cmd_table[command](file, + ib_dev, + &ucore, + &uhw); + if (!ret) + ret = written_count; } else { - return -EFAULT; + ret = -ENOSYS; } - if ((dev->cmd_perf & (COMMAND_INFO_MASK - 1)) == hdr.command) { - ktime_get_ts(&ts2); - t1 = timespec_to_ktime(ts1); - t2 = timespec_to_ktime(ts2); - delta = ktime_sub(t2, t1); - ds = ktime_to_ns(delta); - spin_lock(&dev->cmd_perf_lock); - dividend = dev->cmd_avg * dev->cmd_n + ds; - ++dev->cmd_n; - divisor = dev->cmd_n; - do_div(dividend, divisor); - dev->cmd_avg = dividend; - spin_unlock(&dev->cmd_perf_lock); - if (dev->cmd_perf & COMMAND_INFO_MASK) { - pr_info("%s: %s 
execution time = %lld nsec\n", - file->device->ib_dev->name, - verbs_cmd_str(hdr.command), - (long long)ds); - } - } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) { struct ib_uverbs_file *file = filp->private_data; + struct ib_device *ib_dev; + int ret = 0; + int srcu_key; - if (!file->ucontext) - return -ENODEV; - else - return file->device->ib_dev->mmap(file->ucontext, vma); -} -/* XXX Not supported in FreeBSD */ -#if 0 -static unsigned long ib_uverbs_get_unmapped_area(struct file *filp, - unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct ib_uverbs_file *file = filp->private_data; - - if (!file->ucontext) - return -ENODEV; - else { - if (!file->device->ib_dev->get_unmapped_area) - return current->mm->get_unmapped_area(filp, addr, len, - pgoff, flags); - - return file->device->ib_dev->get_unmapped_area(filp, addr, len, - pgoff, flags); + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; } -} -#endif -static long ib_uverbs_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct ib_uverbs_file *file = filp->private_data; - - if (!file->device->ib_dev->ioctl) - return -ENOTSUPP; - if (!file->ucontext) - return -ENODEV; + ret = -ENODEV; else - /* provider should provide it's own locking mechanism */ - return file->device->ib_dev->ioctl(file->ucontext, cmd, arg); + ret = ib_dev->mmap(file->ucontext, vma); +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; } /* * ib_uverbs_open() does not need the BKL: * * - the ib_uverbs_device structures are properly reference counted and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - there is no ioctl method to race 
against; * - the open method will either immediately run -ENXIO, or all * required initialization will be done. */ static int ib_uverbs_open(struct inode *inode, struct file *filp) { struct ib_uverbs_device *dev; struct ib_uverbs_file *file; + struct ib_device *ib_dev; int ret; + int module_dependent; + int srcu_key; dev = container_of(inode->i_cdev->si_drv1, struct ib_uverbs_device, cdev); - if (dev) - kref_get(&dev->ref); - else + if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; - if (!try_module_get(dev->ib_dev->owner)) { - ret = -ENODEV; + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + mutex_lock(&dev->lists_mutex); + ib_dev = srcu_dereference(dev->ib_dev, + &dev->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; goto err; } - file = kmalloc(sizeof *file, GFP_KERNEL); + /* In case IB device supports disassociate ucontext, there is no hard + * dependency between uverbs device and its low level device. + */ + module_dependent = !(ib_dev->disassociate_ucontext); + + if (module_dependent) { + if (!try_module_get(ib_dev->owner)) { + ret = -ENODEV; + goto err; + } + } + + file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) { ret = -ENOMEM; - goto err_module; + if (module_dependent) + goto err_module; + + goto err; } file->device = dev; file->ucontext = NULL; file->async_file = NULL; kref_init(&file->ref); mutex_init(&file->mutex); + mutex_init(&file->cleanup_mutex); filp->private_data = file; + kobject_get(&dev->kobj); + list_add_tail(&file->list, &dev->uverbs_file_list); + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return nonseekable_open(inode, filp); err_module: - module_put(dev->ib_dev->owner); + module_put(ib_dev->owner); err: - kref_put(&dev->ref, ib_uverbs_release_dev); + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + if (atomic_dec_and_test(&dev->refcount)) + ib_uverbs_comp_dev(dev); + return ret; } static int ib_uverbs_close(struct inode *inode, struct 
file *filp) { struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_device *dev = file->device; - ib_uverbs_cleanup_ucontext(file, file->ucontext); + mutex_lock(&file->cleanup_mutex); + if (file->ucontext) { + ib_uverbs_cleanup_ucontext(file, file->ucontext); + file->ucontext = NULL; + } + mutex_unlock(&file->cleanup_mutex); + mutex_lock(&file->device->lists_mutex); + if (!file->is_closed) { + list_del(&file->list); + file->is_closed = 1; + } + mutex_unlock(&file->device->lists_mutex); + if (file->async_file) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); kref_put(&file->ref, ib_uverbs_release_file); + kobject_put(&dev->kobj); return 0; } static const struct file_operations uverbs_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, - .open = ib_uverbs_open, + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, - .unlocked_ioctl = ib_uverbs_ioctl, }; static const struct file_operations uverbs_mmap_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, + .owner = THIS_MODULE, + .write = ib_uverbs_write, .mmap = ib_uverbs_mmap, - .open = ib_uverbs_open, + .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, -/* XXX Not supported in FreeBSD */ -#if 0 - .get_unmapped_area = ib_uverbs_get_unmapped_area, -#endif - .unlocked_ioctl = ib_uverbs_ioctl, }; static struct ib_client uverbs_client = { .name = "uverbs", .add = ib_uverbs_add_one, .remove = ib_uverbs_remove_one }; static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, char *buf) { + int ret = -ENODEV; + int srcu_key; struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_device *ib_dev; if (!dev) return -ENODEV; - return sprintf(buf, "%s\n", dev->ib_dev->name); + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sprintf(buf, "%s\n", ib_dev->name); + 
srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + return ret; } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static ssize_t show_dev_ref_cnt(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_uverbs_device *dev = dev_get_drvdata(device); - - if (!dev) - return -ENODEV; - - return sprintf(buf, "%d\n", atomic_read(&dev->ref.refcount)); -} -static DEVICE_ATTR(ref_cnt, S_IRUGO, show_dev_ref_cnt, NULL); - static ssize_t show_dev_abi_version(struct device *device, struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = dev_get_drvdata(device); + int ret = -ENODEV; + int srcu_key; + struct ib_device *ib_dev; if (!dev) return -ENODEV; + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); - return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver); + return ret; } static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); -static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION); -} +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_VERBS_ABI_VERSION)); -static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); - static dev_t overflow_maj; static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); /* * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by * requesting a new major number and doubling the number of max devices we * support. It's stupid, but simple. 
*/ static int find_overflow_devnum(void) { int ret; if (!overflow_maj) { ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n"); + pr_err("user_verbs: couldn't register dynamic device number\n"); return ret; } } ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES); if (ret >= IB_UVERBS_MAX_DEVICES) return -1; return ret; } -#include -static ssize_t -show_dev_device(struct device *device, struct device_attribute *attr, char *buf) -{ - struct ib_uverbs_device *dev = dev_get_drvdata(device); - - if (!dev || !dev->ib_dev->dma_device) - return -ENODEV; - - return sprintf(buf, "0x%04x\n", - ((struct pci_dev *)dev->ib_dev->dma_device)->device); -} -static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL); - -static ssize_t -show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf) -{ - struct ib_uverbs_device *dev = dev_get_drvdata(device); - - if (!dev || !dev->ib_dev->dma_device) - return -ENODEV; - - return sprintf(buf, "0x%04x\n", - ((struct pci_dev *)dev->ib_dev->dma_device)->vendor); -} - -static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL); - -struct attribute *device_attrs[] = -{ - &dev_attr_device.attr, - &dev_attr_vendor.attr, - NULL -}; - -static struct attribute_group device_group = { - .name = "device", - .attrs = device_attrs -}; - static void ib_uverbs_add_one(struct ib_device *device) { int devnum; dev_t base; struct ib_uverbs_device *uverbs_dev; + int ret; if (!device->alloc_ucontext) return; uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL); if (!uverbs_dev) return; - kref_init(&uverbs_dev->ref); + ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); + if (ret) { + kfree(uverbs_dev); + return; + } + + atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); + kobject_init(&uverbs_dev->kobj, 
&ib_uverbs_dev_ktype); + mutex_init(&uverbs_dev->lists_mutex); + INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); + INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); if (devnum >= IB_UVERBS_MAX_DEVICES) { spin_unlock(&map_lock); devnum = find_overflow_devnum(); if (devnum < 0) - goto err; + goto err; spin_lock(&map_lock); uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; base = devnum + overflow_maj; set_bit(devnum, overflow_map); } else { uverbs_dev->devnum = devnum; base = devnum + IB_UVERBS_BASE_DEV; set_bit(devnum, dev_map); } spin_unlock(&map_lock); - uverbs_dev->ib_dev = device; + rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; + uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj; kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); if (cdev_add(&uverbs_dev->cdev, base, 1)) goto err_cdev; uverbs_dev->dev = device_create(uverbs_class, device->dma_device, uverbs_dev->cdev.dev, uverbs_dev, "uverbs%d", uverbs_dev->devnum); if (IS_ERR(uverbs_dev->dev)) goto err_cdev; if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) goto err_class; - if (device_create_file(uverbs_dev->dev, &dev_attr_ref_cnt)) - goto err_class; if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; - if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group)) - goto err_class; ib_set_client_data(device, &uverbs_client, uverbs_dev); return; err_class: device_destroy(uverbs_class, uverbs_dev->cdev.dev); err_cdev: cdev_del(&uverbs_dev->cdev); if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) clear_bit(devnum, dev_map); else clear_bit(devnum, overflow_map); err: - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) 
+ ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); return; } -static void ib_uverbs_remove_one(struct ib_device *device) +static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, + struct ib_device *ib_dev) { - struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); + struct ib_uverbs_file *file; + struct ib_uverbs_event_file *event_file; + struct ib_event event; + /* Pending running commands to terminate */ + synchronize_srcu(&uverbs_dev->disassociate_srcu); + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + event.device = ib_dev; + + mutex_lock(&uverbs_dev->lists_mutex); + while (!list_empty(&uverbs_dev->uverbs_file_list)) { + struct ib_ucontext *ucontext; + file = list_first_entry(&uverbs_dev->uverbs_file_list, + struct ib_uverbs_file, list); + file->is_closed = 1; + list_del(&file->list); + kref_get(&file->ref); + mutex_unlock(&uverbs_dev->lists_mutex); + + ib_uverbs_event_handler(&file->event_handler, &event); + + mutex_lock(&file->cleanup_mutex); + ucontext = file->ucontext; + file->ucontext = NULL; + mutex_unlock(&file->cleanup_mutex); + + /* At this point ib_uverbs_close cannot be running + * ib_uverbs_cleanup_ucontext + */ + if (ucontext) { + /* We must release the mutex before going ahead and + * calling disassociate_ucontext. disassociate_ucontext + * might end up indirectly calling uverbs_close, + * for example due to freeing the resources + * (e.g mmput). 
+ */ + ib_dev->disassociate_ucontext(ucontext); + ib_uverbs_cleanup_ucontext(file, ucontext); + } + + mutex_lock(&uverbs_dev->lists_mutex); + kref_put(&file->ref, ib_uverbs_release_file); + } + + while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { + event_file = list_first_entry(&uverbs_dev-> + uverbs_events_file_list, + struct ib_uverbs_event_file, + list); + spin_lock_irq(&event_file->lock); + event_file->is_closed = 1; + spin_unlock_irq(&event_file->lock); + + list_del(&event_file->list); + if (event_file->is_async) { + ib_unregister_event_handler(&event_file->uverbs_file-> + event_handler); + event_file->uverbs_file->event_handler.device = NULL; + } + + wake_up_interruptible(&event_file->poll_wait); + linux_poll_wakeup(event_file->filp); + kill_fasync(&event_file->async_queue, SIGIO, POLL_IN); + } + mutex_unlock(&uverbs_dev->lists_mutex); +} + +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int wait_clients = 1; + if (!uverbs_dev) return; - sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group); dev_set_drvdata(uverbs_dev->dev, NULL); device_destroy(uverbs_class, uverbs_dev->cdev.dev); cdev_del(&uverbs_dev->cdev); if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(uverbs_dev->devnum, dev_map); + clear_bit(uverbs_dev->devnum, dev_map); else clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); - wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + if (device->disassociate_ucontext) { + /* We disassociate HW resources and immediately return. + * Userspace will see a EIO errno for all future access. + * Upon returning, ib_device may be freed internally and is not + * valid any more. + * uverbs_device is still available until all clients close + * their files, then the uverbs device ref count will be zero + * and its resources will be freed. 
+ * Note: At this point no more files can be opened since the + * cdev was deleted, however active clients can still issue + * commands and close their open files. + */ + rcu_assign_pointer(uverbs_dev->ib_dev, NULL); + ib_uverbs_free_hw_resources(uverbs_dev, device); + wait_clients = 0; + } + + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); + if (wait_clients) + wait_for_completion(&uverbs_dev->comp); + kobject_put(&uverbs_dev->kobj); } static char *uverbs_devnode(struct device *dev, umode_t *mode) { if (mode) *mode = 0666; return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } static int __init ib_uverbs_init(void) { int ret; ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register device number\n"); + pr_err("user_verbs: couldn't register device number\n"); goto out; } uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); if (IS_ERR(uverbs_class)) { ret = PTR_ERR(uverbs_class); - printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n"); + pr_err("user_verbs: couldn't create class infiniband_verbs\n"); goto out_chrdev; } uverbs_class->devnode = uverbs_devnode; - ret = class_create_file(uverbs_class, &class_attr_abi_version); + ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n"); + pr_err("user_verbs: couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&uverbs_client); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register client\n"); + pr_err("user_verbs: couldn't register client\n"); goto out_class; } return 0; out_class: class_destroy(uverbs_class); out_chrdev: unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); out: return ret; } static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); class_destroy(uverbs_class); 
unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); if (overflow_maj) unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); idr_destroy(&ib_uverbs_pd_idr); idr_destroy(&ib_uverbs_mr_idr); idr_destroy(&ib_uverbs_mw_idr); idr_destroy(&ib_uverbs_ah_idr); idr_destroy(&ib_uverbs_cq_idr); idr_destroy(&ib_uverbs_qp_idr); idr_destroy(&ib_uverbs_srq_idr); } -module_init(ib_uverbs_init); +module_init_order(ib_uverbs_init, SI_ORDER_THIRD); module_exit(ib_uverbs_cleanup); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_marshall.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_marshall.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/core/uverbs_marshall.c (revision 319974) @@ -1,144 +1,148 @@ /* * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include #include void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, struct ib_ah_attr *src) { memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid); dst->grh.flow_label = src->grh.flow_label; dst->grh.sgid_index = src->grh.sgid_index; dst->grh.hop_limit = src->grh.hop_limit; dst->grh.traffic_class = src->grh.traffic_class; memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); dst->dlid = src->dlid; dst->sl = src->sl; dst->src_path_bits = src->src_path_bits; dst->static_rate = src->static_rate; dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0; dst->port_num = src->port_num; dst->reserved = 0; } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { dst->qp_state = src->qp_state; dst->cur_qp_state = src->cur_qp_state; dst->path_mtu = src->path_mtu; dst->path_mig_state = src->path_mig_state; dst->qkey = src->qkey; dst->rq_psn = src->rq_psn; dst->sq_psn = src->sq_psn; dst->dest_qp_num = src->dest_qp_num; dst->qp_access_flags = src->qp_access_flags; dst->max_send_wr = src->cap.max_send_wr; dst->max_recv_wr = src->cap.max_recv_wr; dst->max_send_sge = src->cap.max_send_sge; dst->max_recv_sge = src->cap.max_recv_sge; dst->max_inline_data = src->cap.max_inline_data; ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); dst->pkey_index = src->pkey_index; dst->alt_pkey_index = src->alt_pkey_index; dst->en_sqd_async_notify = src->en_sqd_async_notify; dst->sq_draining = src->sq_draining; dst->max_rd_atomic = src->max_rd_atomic; dst->max_dest_rd_atomic = src->max_dest_rd_atomic; dst->min_rnr_timer = src->min_rnr_timer; dst->port_num = src->port_num; 
dst->timeout = src->timeout; dst->retry_cnt = src->retry_cnt; dst->rnr_retry = src->rnr_retry; dst->alt_port_num = src->alt_port_num; dst->alt_timeout = src->alt_timeout; memset(dst->reserved, 0, sizeof(dst->reserved)); } EXPORT_SYMBOL(ib_copy_qp_attr_to_user); void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, struct ib_sa_path_rec *src) { memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid); memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid); dst->dlid = src->dlid; dst->slid = src->slid; dst->raw_traffic = src->raw_traffic; dst->flow_label = src->flow_label; dst->hop_limit = src->hop_limit; dst->traffic_class = src->traffic_class; dst->reversible = src->reversible; dst->numb_path = src->numb_path; dst->pkey = src->pkey; dst->sl = src->sl; dst->mtu_selector = src->mtu_selector; dst->mtu = src->mtu; dst->rate_selector = src->rate_selector; dst->rate = src->rate; dst->packet_life_time = src->packet_life_time; dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; } EXPORT_SYMBOL(ib_copy_path_rec_to_user); void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, struct ib_user_path_rec *src) { memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); dst->dlid = src->dlid; dst->slid = src->slid; dst->raw_traffic = src->raw_traffic; dst->flow_label = src->flow_label; dst->hop_limit = src->hop_limit; dst->traffic_class = src->traffic_class; dst->reversible = src->reversible; dst->numb_path = src->numb_path; dst->pkey = src->pkey; dst->sl = src->sl; dst->mtu_selector = src->mtu_selector; dst->mtu = src->mtu; dst->rate_selector = src->rate_selector; dst->rate = src->rate; dst->packet_life_time = src->packet_life_time; dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; + + memset(dst->dmac, 0, sizeof(dst->dmac)); + dst->net = NULL; + dst->ifindex = 0; + dst->gid_type = IB_GID_TYPE_IB; } 
EXPORT_SYMBOL(ib_copy_path_rec_from_user); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/mtrack.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/mtrack.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/mtrack.h (nonexistent) @@ -1,844 +0,0 @@ -#ifndef __mtrack_h_ -#define __mtrack_h_ - -#include "memtrack.h" - -#include -#include -#include -#include /* For ioremap_nocache, ioremap, iounmap */ -#include -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 27) -# include /* For ioremap_nocache, ioremap, iounmap */ -#endif -#include /* For all page handling */ -#include /* For all work-queue handling */ -#include /* For using scatterlists */ -#include /* For skbufs handling */ -#include /* For copy from/to user */ - -#define MEMTRACK_ERROR_INJECTION_MESSAGE(file, line, func) ({ \ - printk(KERN_ERR "%s failure injected at %s:%d\n", func, file, line); \ - dump_stack(); \ -}) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 14) -#define RDMA_KZALLOC_H -#define kzalloc(size, flags) ({ \ - void *__memtrack_kz_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\ - else \ - __memtrack_kz_addr = kmalloc(size, flags); \ - if (__memtrack_kz_addr && !is_non_trackable_alloc_func(__func__)) { \ - memset(__memtrack_kz_addr, 0, size); \ - } \ - __memtrack_kz_addr; \ -}) - -#else -#define kzalloc(size, flags) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\ - else \ - __memtrack_addr = kzalloc(size, flags); \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) - -#endif - -#define kzalloc_node(size, flags, node) ({ \ - void *__memtrack_addr = 
NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc_node"); \ - else \ - __memtrack_addr = kzalloc_node(size, flags, node); \ - if (__memtrack_addr && (size) && \ - !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) -#define kcalloc(n, size, flags) kzalloc((n)*(size), flags) -#else -#define kcalloc(n, size, flags) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kcalloc");\ - else \ - __memtrack_addr = kcalloc(n, size, flags); \ - if (__memtrack_addr && (size)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) -#endif - - - -#ifdef ZERO_OR_NULL_PTR -#define kmalloc(sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\ - else \ - __memtrack_addr = kmalloc(sz, flgs); \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, sz); \ - } \ - __memtrack_addr; \ -}) -#else -#define kmalloc(sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\ - else \ - __memtrack_addr = kmalloc(sz, flgs); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, sz); \ - } \ - __memtrack_addr; \ -}) - -#endif - -#define 
kmalloc_node(sz, flgs, node) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc_node"); \ - else \ - __memtrack_addr = kmalloc_node(sz, flgs, node); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \ - get_random_bytes(__memtrack_addr, sz); \ - } \ - __memtrack_addr; \ -}) - -#ifdef ZERO_OR_NULL_PTR -#define kmemdup(src, sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\ - else \ - __memtrack_addr = kmemdup(src, sz, flgs); \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - } \ - __memtrack_addr; \ -}) -#else -#define kmemdup(src, sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\ - else \ - __memtrack_addr = kmemdup(src, sz, flgs); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - } \ - __memtrack_addr; \ -}) -#endif - -#ifdef ZERO_OR_NULL_PTR -#define kfree(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \ - !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - kfree(__memtrack_addr); \ -}) -#else -#define kfree(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - kfree(__memtrack_addr); 
\ -}) -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) || defined (CONFIG_COMPAT_RCU) -#ifdef kfree_rcu - #undef kfree_rcu -#endif - -#ifdef ZERO_OR_NULL_PTR -#define kfree_rcu(addr, rcu_head) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \ - !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ -}) -#else -#define kfree_rcu(addr, rcu_head) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ -}) -#endif -#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) */ - -#define vmalloc(size) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc");\ - else \ - __memtrack_addr = vmalloc(size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, size); \ - } \ - __memtrack_addr; \ -}) - -#ifndef vzalloc -#define vzalloc(size) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc");\ - else \ - __memtrack_addr = vzalloc(size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#endif - -#ifndef vzalloc_node -#define vzalloc_node(size, node) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - 
MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc_node"); \ - else \ - __memtrack_addr = vzalloc_node(size, node); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, size); \ - } \ - __memtrack_addr; \ -}) -#endif - -#define vmalloc_node(size, node) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc_node"); \ - else \ - __memtrack_addr = vmalloc_node(size, node); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, size); \ - } \ - __memtrack_addr; \ -}) - -#define vfree(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - vfree(__memtrack_addr); \ -}) - - -#define kmem_cache_alloc(cache, flags) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmem_cache_alloc"); \ - else \ - __memtrack_addr = kmem_cache_alloc(cache, flags); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 1, 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) - - -#define kmem_cache_free(cache, addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - kmem_cache_free(cache, __memtrack_addr); \ -}) - - -/* All IO-MAP handling */ -#define ioremap(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if 
(memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap");\ - else \ - __memtrack_addr = ioremap(phys_addr, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) - -#define io_mapping_create_wc(base, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "io_mapping_create_wc"); \ - else \ - __memtrack_addr = io_mapping_create_wc(base, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) - -#define io_mapping_free(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - io_mapping_free(__memtrack_addr); \ -}) - -#ifdef CONFIG_PPC -#ifdef ioremap_nocache - #undef ioremap_nocache -#endif -#define ioremap_nocache(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ - else \ - __memtrack_addr = ioremap(phys_addr, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#else -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 18) /* 2.6.16 - 2.6.17 */ -#ifdef ioremap_nocache - #undef ioremap_nocache -#endif -#define ioremap_nocache(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ - else \ - __memtrack_addr = ioremap(phys_addr, size); \ - if (__memtrack_addr) { \ - 
memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#else -#define ioremap_nocache(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ - else \ - __memtrack_addr = ioremap_nocache(phys_addr, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#endif /* Kernel version is under 2.6.18 */ -#endif /* PPC */ - -#define iounmap(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - iounmap(__memtrack_addr); \ -}) - - -/* All Page handlers */ -/* TODO: Catch netif_rx for page dereference */ -#define alloc_pages_node(nid, gfp_mask, order) ({ \ - struct page *page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages_node"); \ - else \ - page_addr = (struct page *)alloc_pages_node(nid, gfp_mask, order); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - page_addr; \ -}) - -#ifdef CONFIG_NUMA -#define alloc_pages(gfp_mask, order) ({ \ - struct page *page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages"); \ - else \ - page_addr = (struct page *)alloc_pages(gfp_mask, order); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - 
page_addr; \ -}) -#else -#ifdef alloc_pages - #undef alloc_pages -#endif -#define alloc_pages(gfp_mask, order) ({ \ - struct page *page_addr; \ - \ - page_addr = (struct page *)alloc_pages_node(numa_node_id(), gfp_mask, order); \ - page_addr; \ -}) -#endif - -#define __get_free_pages(gfp_mask, order) ({ \ - struct page *page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "__get_free_pages"); \ - else \ - page_addr = (struct page *)__get_free_pages(gfp_mask, order); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - page_addr; \ -}) - -#define get_zeroed_page(gfp_mask) ({ \ - struct page *page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_zeroed_page"); \ - else \ - page_addr = (struct page *)get_zeroed_page(gfp_mask); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - (unsigned long)page_addr; \ -}) - -#define __free_pages(addr, order) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ - memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - __free_pages(addr, order); \ -}) - - -#define free_pages(addr, order) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ - memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, 
(unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - free_pages(addr, order); \ -}) - - -#define get_page(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - if (memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 0, __FILE__, __LINE__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - } \ - get_page(addr); \ -}) - -#define get_user_pages_fast(start, nr_pages, write, pages) ({ \ - int __memtrack_rc = -1; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_user_pages_fast"); \ - else \ - __memtrack_rc = get_user_pages_fast(start, nr_pages, write, pages); \ - if (__memtrack_rc > 0 && !is_non_trackable_alloc_func(__func__)) { \ - int __memtrack_i; \ - \ - for (__memtrack_i = 0; __memtrack_i < __memtrack_rc; __memtrack_i++) \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(pages[__memtrack_i]), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_rc; \ -}) - -#define put_page(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - /* Check whether this is not part of umem put page & not */\ - /* a new addr and the ref-count is 1 then we'll free this addr */\ - /* Don't change the order these conditions */ \ - if (!is_umem_put_page(__func__) && \ - !memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__) && \ - (memtrack_get_page_ref_count((unsigned long)(__memtrack_addr)) == 1)) { \ - memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - } \ - put_page(addr); \ -}) - - -/* Work-Queue handlers */ -#ifdef create_workqueue - #undef create_workqueue -#endif -#ifdef create_rt_workqueue - #undef create_rt_workqueue -#endif -#ifdef 
create_freezeable_workqueue - #undef create_freezeable_workqueue -#endif -#ifdef create_singlethread_workqueue - #undef create_singlethread_workqueue -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) /* 2.6.18 - 2.6.19 */ -#define create_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_singlethread_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) /* 2.6.20 - 2.6.27 */ -#define create_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 22) /* 2.6.20 - 2.6.21 */ -#define create_freezeable_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned 
long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#else /* 2.6.22 - 2.6.27 */ -#define create_freezeable_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#endif /* 2.6.20 - 2.6.27 */ - -#define create_singlethread_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36) /* 2.6.28 - 2.6.35 */ - -#ifdef alloc_workqueue - #undef alloc_workqueue -#endif - -#define alloc_workqueue(name, flags, max_active) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), (flags), (max_active), 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 0, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } 
\ - wq_addr; \ -}) - -#define create_rt_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_rt_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 0, 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_freezeable_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 1, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_singlethread_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 0, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#else /* 2.6.36 */ -#ifdef alloc_workqueue - #undef alloc_workqueue -#endif -#ifdef CONFIG_LOCKDEP -#define alloc_workqueue(name, flags, max_active) \ -({ \ - static struct lock_class_key __key; \ - const char *__lock_name; \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (__builtin_constant_p(name)) \ - __lock_name = (name); \ - else \ - __lock_name = #name; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ - else \ - wq_addr = __alloc_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 
0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#else -#define alloc_workqueue(name, flags, max_active) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ - else \ - wq_addr = __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#endif - -#define create_workqueue(name) \ - alloc_workqueue((name), WQ_RESCUER, 1); - -#define create_freezeable_workqueue(name) \ - alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1); - -#define create_singlethread_workqueue(name) \ - alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1); - -#endif /* Work-Queue Kernel Versions */ - -#define destroy_workqueue(wq_addr) ({ \ - void *__memtrack_addr = (void *)wq_addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - destroy_workqueue(wq_addr); \ -}) - -/* ONLY error injection to functions that we don't monitor */ -#define alloc_skb(size, prio) ({ \ - struct sk_buff *__memtrack_skb = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb"); \ - else \ - __memtrack_skb = alloc_skb(size, prio); \ - __memtrack_skb; \ -}) - -#define dev_alloc_skb(size) ({ \ - struct sk_buff *__memtrack_skb = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "dev_alloc_skb"); \ - else \ - __memtrack_skb = dev_alloc_skb(size); \ - __memtrack_skb; \ -}) - -#define alloc_skb_fclone(size, prio) ({ \ - struct sk_buff *__memtrack_skb = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb_fclone"); \ - else \ - __memtrack_skb = 
alloc_skb_fclone(size, prio); \ - __memtrack_skb; \ -}) - -#define copy_from_user(to, from, n) ({ \ - int ret = n; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_from_user"); \ - else \ - ret = copy_from_user(to, from, n); \ - ret; \ -}) - -#define copy_to_user(to, from, n) ({ \ - int ret = n; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_to_user"); \ - else \ - ret = copy_to_user(to, from, n); \ - ret; \ -}) - -#define sysfs_create_file(kobj, attr) ({ \ - int ret = -ENOSYS; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_file"); \ - else \ - ret = sysfs_create_file(kobj, attr); \ - ret; \ -}) - -#define sysfs_create_link(kobj, target, name) ({ \ - int ret = -ENOSYS; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_link"); \ - else \ - ret = sysfs_create_link(kobj, target, name); \ - ret; \ -}) - -#define sysfs_create_group(kobj, grp) ({ \ - int ret = -ENOSYS; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_group"); \ - else \ - ret = sysfs_create_group(kobj, grp); \ - ret; \ -}) - -#endif /* __mtrack_h_ */ - Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/mtrack.h ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.c (nonexistent) @@ -1,960 +0,0 @@ -/* - This software is available to you under a choice of one of two - licenses. 
You may choose to be licensed under the terms of the GNU - General Public License (GPL) Version 2, available at - , or the OpenIB.org BSD - license, available in the LICENSE.TXT file accompanying this - software. These details are also available at - . - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. -*/ - -#define LINUXKPI_PARAM_PREFIX memtrack_ - -#define C_MEMTRACK_C - -#ifdef kmalloc - #undef kmalloc -#endif -#ifdef kmemdup - #undef kmemdup -#endif -#ifdef kfree - #undef kfree -#endif -#ifdef vmalloc - #undef vmalloc -#endif -#ifdef vzalloc - #undef vzalloc -#endif -#ifdef vzalloc_node - #undef vzalloc_node -#endif -#ifdef vfree - #undef vfree -#endif -#ifdef kmem_cache_alloc - #undef kmem_cache_alloc -#endif -#ifdef kmem_cache_free - #undef kmem_cache_free -#endif -#ifdef ioremap - #undef ioremap -#endif -#ifdef io_mapping_create_wc - #undef io_mapping_create_wc -#endif -#ifdef io_mapping_free - #undef io_mapping_free -#endif -#ifdef ioremap_nocache - #undef ioremap_nocache -#endif -#ifdef iounmap - #undef iounmap -#endif -#ifdef alloc_pages - #undef alloc_pages -#endif -#ifdef free_pages - #undef free_pages -#endif -#ifdef get_page - #undef get_page -#endif -#ifdef put_page - #undef put_page -#endif -#ifdef create_workqueue - #undef create_workqueue -#endif -#ifdef create_rt_workqueue - #undef create_rt_workqueue -#endif -#ifdef create_freezeable_workqueue - #undef create_freezeable_workqueue -#endif -#ifdef create_singlethread_workqueue - #undef 
create_singlethread_workqueue -#endif -#ifdef destroy_workqueue - #undef destroy_workqueue -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "memtrack.h" - -#include - - -MODULE_AUTHOR("Mellanox Technologies LTD."); -MODULE_DESCRIPTION("Memory allocations tracking"); -MODULE_LICENSE("GPL"); - -#define MEMTRACK_HASH_SZ ((1<<15)-19) /* prime: http://www.utm.edu/research/primes/lists/2small/0bit.html */ -#define MAX_FILENAME_LEN 31 - -#define memtrack_spin_lock(spl, flags) spin_lock_irqsave(spl, flags) -#define memtrack_spin_unlock(spl, flags) spin_unlock_irqrestore(spl, flags) - -/* if a bit is set then the corresponding allocation is tracked. - bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */ -static unsigned long track_mask = -1; /* effectively everything */ -module_param(track_mask, ulong, 0444); -MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked"); - -/* if a bit is set then the corresponding allocation is strictly tracked. 
- That is, before inserting the whole range is checked to not overlap any - of the allocations already in the database */ -static unsigned long strict_track_mask = 0; /* no strict tracking */ -module_param(strict_track_mask, ulong, 0444); -MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking"); - -/* Sets the frequency of allocations failures injections - if set to 0 all allocation should succeed */ -static unsigned int inject_freq = 0; -module_param(inject_freq, uint, 0644); -MODULE_PARM_DESC(inject_freq, "Error injection frequency, default is 0 (disabled)"); - -static int random_mem = 1; -module_param(random_mem, uint, 0644); -MODULE_PARM_DESC(random_mem, "When set, randomize allocated memory, default is 1 (enabled)"); - -struct memtrack_meminfo_t { - unsigned long addr; - unsigned long size; - unsigned long line_num; - unsigned long dev; - unsigned long addr2; - int direction; - struct memtrack_meminfo_t *next; - struct list_head list; /* used to link all items from a certain type together */ - char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. 
packing */ - char ext_info[32]; -}; - -static struct kmem_cache *meminfo_cache; - -struct tracked_obj_desc_t { - struct memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ]; - spinlock_t hash_lock; - unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */ - struct list_head tracked_objs_head; /* head of list of all objects */ - int strict_track; /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */ -}; - -static struct tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES]; - -static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = { - "kmalloc", - "vmalloc", - "kmem_cache_alloc", - "io_remap", - "create_workqueue", - "alloc_pages", - "ib_dma_map_single", - "ib_dma_map_page", - "ib_dma_map_sg" -}; - -static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = { - "kfree", - "vfree", - "kmem_cache_free", - "io_unmap", - "destory_workqueue", - "free_pages", - "ib_dma_unmap_single", - "ib_dma_unmap_page", - "ib_dma_unmap_sg" -}; - -static inline const char *memtype_alloc_str(enum memtrack_memtype_t memtype) -{ - switch (memtype) { - case MEMTRACK_KMALLOC: - case MEMTRACK_VMALLOC: - case MEMTRACK_KMEM_OBJ: - case MEMTRACK_IOREMAP: - case MEMTRACK_WORK_QUEUE: - case MEMTRACK_PAGE_ALLOC: - case MEMTRACK_DMA_MAP_SINGLE: - case MEMTRACK_DMA_MAP_PAGE: - case MEMTRACK_DMA_MAP_SG: - return rsc_names[memtype]; - default: - return "(Unknown allocation type)"; - } -} - -static inline const char *memtype_free_str(enum memtrack_memtype_t memtype) -{ - switch (memtype) { - case MEMTRACK_KMALLOC: - case MEMTRACK_VMALLOC: - case MEMTRACK_KMEM_OBJ: - case MEMTRACK_IOREMAP: - case MEMTRACK_WORK_QUEUE: - case MEMTRACK_PAGE_ALLOC: - case MEMTRACK_DMA_MAP_SINGLE: - case MEMTRACK_DMA_MAP_PAGE: - case MEMTRACK_DMA_MAP_SG: - return rsc_free_names[memtype]; - default: - return "(Unknown allocation type)"; - } -} - -/* - * overlap_a_b - */ -static inline int overlap_a_b(unsigned long a_start, unsigned long 
a_end, - unsigned long b_start, unsigned long b_end) -{ - if ((b_start > a_end) || (a_start > b_end)) - return 0; - - return 1; -} - -/* - * check_overlap - */ -static void check_overlap(enum memtrack_memtype_t memtype, - struct memtrack_meminfo_t *mem_info_p, - struct tracked_obj_desc_t *obj_desc_p) -{ - struct list_head *pos, *next; - struct memtrack_meminfo_t *cur; - unsigned long start_a, end_a, start_b, end_b; - - start_a = mem_info_p->addr; - end_a = mem_info_p->addr + mem_info_p->size - 1; - - list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) { - cur = list_entry(pos, struct memtrack_meminfo_t, list); - - start_b = cur->addr; - end_b = cur->addr + cur->size - 1; - - if (overlap_a_b(start_a, end_a, start_b, end_b)) - printk(KERN_ERR "%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n", - memtype_alloc_str(memtype), mem_info_p->addr, - mem_info_p->addr + mem_info_p->size - 1, cur->addr, - cur->addr + cur->size - 1); - } -} - -/* Invoke on memory allocation */ -void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, unsigned long addr2, - int direction, const char *filename, - const unsigned long line_num, int alloc_flags) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - new_mem_info_p = (struct memtrack_meminfo_t *)kmem_cache_alloc(meminfo_cache, alloc_flags); - if (new_mem_info_p == NULL) { - printk(KERN_ERR "%s: Failed allocating kmem_cache item for new mem_info. 
" - "Lost tracking on allocation at %s:%lu...\n", __func__, - filename, line_num); - return; - } - /* save allocation properties */ - new_mem_info_p->addr = addr; - new_mem_info_p->size = size; - new_mem_info_p->dev = dev; - new_mem_info_p->addr2 = addr2; - new_mem_info_p->direction = direction; - - new_mem_info_p->line_num = line_num; - *new_mem_info_p->ext_info = '\0'; - /* Make sure that we will print out the path tail if the given filename is longer - * than MAX_FILENAME_LEN. (otherwise, we will not see the name of the actual file - * in the printout -- only the path head! - */ - if (strlen(filename) > MAX_FILENAME_LEN) - strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN); - else - strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN); - - new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */ - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* make sure given memory location is not already allocated */ - if ((memtype != MEMTRACK_DMA_MAP_SINGLE) && (memtype != MEMTRACK_DMA_MAP_PAGE) && - (memtype != MEMTRACK_DMA_MAP_SG)) { - - /* make sure given memory location is not already allocated */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { - /* Found given address in the database */ - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n", - __func__, filename, line_num, - memtype_alloc_str(memtype), addr, - cur_mem_info_p->filename, - cur_mem_info_p->line_num); - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - kmem_cache_free(meminfo_cache, new_mem_info_p); - return; - } - cur_mem_info_p = cur_mem_info_p->next; - } - } - /* not found - we can put in the hash bucket */ - /* link as first */ - new_mem_info_p->next = obj_desc_p->mem_hash[hash_val]; - obj_desc_p->mem_hash[hash_val] = new_mem_info_p; - if 
(obj_desc_p->strict_track) - check_overlap(memtype, new_mem_info_p, obj_desc_p); - obj_desc_p->count += size; - list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head); - - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return; -} -EXPORT_SYMBOL(memtrack_alloc); - -/* Invoke on memory free */ -void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, int direction, - const char *filename, const unsigned long line_num) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* find mem_info of given memory location */ - prev_mem_info_p = NULL; - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { - /* Found given address in the database */ - if ((memtype == MEMTRACK_DMA_MAP_SINGLE) || (memtype == MEMTRACK_DMA_MAP_PAGE) || - (memtype == MEMTRACK_DMA_MAP_SG)) { - if (direction != cur_mem_info_p->direction) - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad direction for addr 0x%lX: alloc:0x%x, free:0x%x (allocated in %s::%lu)\n", - __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->direction, direction, - cur_mem_info_p->filename, cur_mem_info_p->line_num); - - if (size != cur_mem_info_p->size) - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad size for addr 0x%lX: size:%lu, free:%lu (allocated in %s::%lu)\n", - __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->size, size, 
- cur_mem_info_p->filename, cur_mem_info_p->line_num); - } - - /* Remove from the bucket/list */ - if (prev_mem_info_p == NULL) - obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */ - else - prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */ - - list_del(&cur_mem_info_p->list); - - obj_desc_p->count -= cur_mem_info_p->size; - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - kmem_cache_free(meminfo_cache, cur_mem_info_p); - return; - } - prev_mem_info_p = cur_mem_info_p; - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found */ - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX, device=0x%lX\n", - __func__, filename, line_num, memtype_free_str(memtype), addr, dev); - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return; -} -EXPORT_SYMBOL(memtrack_free); - -/* - * This function recognizes allocations which - * may be released by kernel (e.g. skb) and - * therefore not trackable by memtrack. - * The allocations are recognized by the name - * of their calling function. 
- */ -int is_non_trackable_alloc_func(const char *func_name) -{ - static const char * const str_str_arr[] = { - /* functions containing these strings consider non trackable */ - "skb", - }; - static const char * const str_str_excep_arr[] = { - /* functions which are exception to the str_str_arr table */ - "ipoib_cm_skb_too_long" - }; - static const char * const str_cmp_arr[] = { - /* functions that allocate SKBs */ - "mlx4_en_alloc_frags", - "mlx4_en_alloc_frag", - "mlx4_en_init_allocator", - "mlx4_en_free_frag", - "mlx4_en_free_rx_desc", - "mlx4_en_destroy_allocator", - "mlx4_en_complete_rx_desc", - /* vnic skb functions */ - "free_single_frag", - "vnic_alloc_rx_skb", - "vnic_rx_skb", - "vnic_alloc_frag", - "vnic_empty_rx_entry", - "vnic_init_allocator", - "vnic_destroy_allocator", - "sdp_post_recv", - "sdp_rx_ring_purge", - "sdp_post_srcavail", - "sk_stream_alloc_page", - "update_send_head", - "sdp_bcopy_get", - "sdp_destroy_resources", - - /* function that allocate memory for RDMA device context */ - "ib_alloc_device" - }; - size_t str_str_arr_size = sizeof(str_str_arr)/sizeof(char *); - size_t str_str_excep_size = sizeof(str_str_excep_arr)/sizeof(char *); - size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); - - int i, j; - - for (i = 0; i < str_str_arr_size; ++i) - if (strstr(func_name, str_str_arr[i])) { - for (j = 0; j < str_str_excep_size; ++j) - if (!strcmp(func_name, str_str_excep_arr[j])) - return 0; - return 1; - } - for (i = 0; i < str_cmp_arr_size; ++i) - if (!strcmp(func_name, str_cmp_arr[i])) - return 1; - return 0; -} -EXPORT_SYMBOL(is_non_trackable_alloc_func); - -/* - * In some cases we need to free a memory - * we defined as "non trackable" (see - * is_non_trackable_alloc_func). - * This function recognizes such releases - * by the name of their calling function. 
- */ -int is_non_trackable_free_func(const char *func_name) -{ - - static const char * const str_cmp_arr[] = { - /* function that deallocate memory for RDMA device context */ - "ib_dealloc_device" - }; - size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); - - int i; - - for (i = 0; i < str_cmp_arr_size; ++i) - if (!strcmp(func_name, str_cmp_arr[i])) - return 1; - return 0; -} -EXPORT_SYMBOL(is_non_trackable_free_func); - - -/* WA - In this function handles confirm - the function name is - '__ib_umem_release' or 'ib_umem_get' - In this case we won't track the - memory there because the kernel - was the one who allocated it. - Return value: - 1 - if the function name is match, else 0 */ -int is_umem_put_page(const char *func_name) -{ - const char func_str[18] = "__ib_umem_release"; - /* In case of error flow put_page is called as part of ib_umem_get */ - const char func_str1[12] = "ib_umem_get"; - - return ((strstr(func_name, func_str) != NULL) || - (strstr(func_name, func_str1) != NULL)) ? 
1 : 0; -} -EXPORT_SYMBOL(is_umem_put_page); - -/* Check page order size - When Freeing a page allocation it checks whether - we are trying to free the same size - we asked to allocate */ -int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, - unsigned long size, const char *filename, - const unsigned long line_num) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - int ret = 0; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return 1; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return 1; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* find mem_info of given memory location */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { - /* Found given address in the database - check size */ - if (cur_mem_info_p->size != size) { - printk(KERN_ERR "mtl size inconsistency: %s: %s::%lu: try to %s at address=0x%lX with size %lu while was created with size %lu\n", - __func__, filename, line_num, memtype_free_str(memtype), - addr, size, cur_mem_info_p->size); - snprintf(cur_mem_info_p->ext_info, sizeof(cur_mem_info_p->ext_info), - "invalid free size %lu\n", size); - ret = 1; - } - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return ret; - } - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found - This function will not give any indication - but will only check the correct size\order - For inconsistency the 'free' function will check that */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return 1; -} -EXPORT_SYMBOL(memtrack_check_size); - -/* Search for a specific addr whether it exist in the - current data-base. 
- It will print an error msg if we get an unexpected result, - Return value: 0 - if addr exist, else 1 */ -int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, - const char *filename, const unsigned long line_num) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return 1; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return 0; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* find mem_info of given memory location */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { - /* Found given address in the database - exiting */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return 0; - } - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found */ - if (expect_exist) - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n", - __func__, filename, line_num, memtype_free_str(memtype), addr); - - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return 1; -} -EXPORT_SYMBOL(memtrack_is_new_addr); - -/* Return current page reference counter */ -int memtrack_get_page_ref_count(unsigned long addr) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - /* This function is called only for page allocation */ - enum memtrack_memtype_t memtype = MEMTRACK_PAGE_ALLOC; - int ref_conut = 0; - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return ref_conut; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - 
/* find mem_info of given memory location */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { - /* Found given address in the database - check ref-count */ - struct page *page = (struct page *)(cur_mem_info_p->addr); - ref_conut = atomic_read(&page->_count); - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return ref_conut; - } - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return ref_conut; -} -EXPORT_SYMBOL(memtrack_get_page_ref_count); - -/* Report current allocations status (for all memory types) */ -static void memtrack_report(void) -{ - enum memtrack_memtype_t memtype; - unsigned long cur_bucket; - struct memtrack_meminfo_t *cur_mem_info_p; - int serial = 1; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - unsigned long detected_leaks = 0; - - printk(KERN_INFO "%s: Currently known allocations:\n", __func__); - for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { - if (tracked_objs_arr[memtype]) { - printk(KERN_INFO "%d) %s:\n", serial, memtype_alloc_str(memtype)); - obj_desc_p = tracked_objs_arr[memtype]; - /* Scan all buckets to find existing allocations */ - /* TBD: this may be optimized by holding a linked list of all hash items */ - for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ - cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; - while (cur_mem_info_p != NULL) { /* scan bucket */ - printk(KERN_INFO "%s::%lu: %s(%lu)==%lX dev=%lX %s\n", - cur_mem_info_p->filename, - cur_mem_info_p->line_num, - memtype_alloc_str(memtype), - cur_mem_info_p->size, - cur_mem_info_p->addr, - cur_mem_info_p->dev, - cur_mem_info_p->ext_info); - cur_mem_info_p = cur_mem_info_p->next; - ++ detected_leaks; - } /* while cur_mem_info_p */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - 
} /* for cur_bucket */ - serial++; - } - } /* for memtype */ - printk(KERN_INFO "%s: Summary: %lu leak(s) detected\n", __func__, detected_leaks); -} - - - -static struct proc_dir_entry *memtrack_tree; - -static enum memtrack_memtype_t get_rsc_by_name(const char *name) -{ - enum memtrack_memtype_t i; - - for (i = 0; i < MEMTRACK_NUM_OF_MEMTYPES; ++i) { - if (strcmp(name, rsc_names[i]) == 0) - return i; - } - - return i; -} - - -static ssize_t memtrack_read(struct file *filp, - char __user *buf, - size_t size, - loff_t *offset) -{ - unsigned long cur, flags; - loff_t pos = *offset; - static char kbuf[20]; - static int file_len; - int _read, to_ret, left; - const char *fname; - enum memtrack_memtype_t memtype; - - if (pos < 0) - return -EINVAL; - - fname = filp->f_dentry->d_name.name; - - memtype = get_rsc_by_name(fname); - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "invalid file name\n"); - return -EINVAL; - } - - if (pos == 0) { - memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags); - cur = tracked_objs_arr[memtype]->count; - memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags); - _read = sprintf(kbuf, "%lu\n", cur); - if (_read < 0) - return _read; - else - file_len = _read; - } - - left = file_len - pos; - to_ret = (left < size) ? 
left : size; - if (copy_to_user(buf, kbuf+pos, to_ret)) - return -EFAULT; - else { - *offset = pos + to_ret; - return to_ret; - } -} - -static const struct file_operations memtrack_proc_fops = { - .read = memtrack_read, -}; - -static const char *memtrack_proc_entry_name = "mt_memtrack"; - -static int create_procfs_tree(void) -{ - struct proc_dir_entry *dir_ent; - struct proc_dir_entry *proc_ent; - int i, j; - unsigned long bit_mask; - - dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL); - if (!dir_ent) - return -1; - - memtrack_tree = dir_ent; - - for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { - if (bit_mask & track_mask) { - proc_ent = create_proc_entry(rsc_names[i], S_IRUGO, memtrack_tree); - if (!proc_ent) - goto undo_create_root; - - proc_ent->proc_fops = &memtrack_proc_fops; - } - } - - goto exit_ok; - -undo_create_root: - for (j = 0, bit_mask = 1; j < i; ++j, bit_mask <<= 1) { - if (bit_mask & track_mask) - remove_proc_entry(rsc_names[j], memtrack_tree); - } - remove_proc_entry(memtrack_proc_entry_name, NULL); - return -1; - -exit_ok: - return 0; -} - - -static void destroy_procfs_tree(void) -{ - int i; - unsigned long bit_mask; - - for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { - if (bit_mask & track_mask) - remove_proc_entry(rsc_names[i], memtrack_tree); - - } - remove_proc_entry(memtrack_proc_entry_name, NULL); -} - -int memtrack_inject_error(void) -{ - int val = 0; - - if (inject_freq) { - if (!(random32() % inject_freq)) - val = 1; - } - - return val; -} -EXPORT_SYMBOL(memtrack_inject_error); - -int memtrack_randomize_mem(void) -{ - return random_mem; -} -EXPORT_SYMBOL(memtrack_randomize_mem); - -/* module entry points */ - -int init_module(void) -{ - enum memtrack_memtype_t i; - int j; - unsigned long bit_mask; - - - /* create a cache for the memtrack_meminfo_t strcutures */ - meminfo_cache = kmem_cache_create("memtrack_meminfo_t", - sizeof(struct memtrack_meminfo_t), 0, - 
SLAB_HWCACHE_ALIGN, NULL); - if (!meminfo_cache) { - printk(KERN_ERR "memtrack::%s: failed to allocate meminfo cache\n", __func__); - return -1; - } - - /* initialize array of descriptors */ - memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr)); - - /* create a tracking object descriptor for all required objects */ - for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { - if (bit_mask & track_mask) { - tracked_objs_arr[i] = vmalloc(sizeof(struct tracked_obj_desc_t)); - if (!tracked_objs_arr[i]) { - printk(KERN_ERR "memtrack: failed to allocate tracking object\n"); - goto undo_cache_create; - } - - memset(tracked_objs_arr[i], 0, sizeof(struct tracked_obj_desc_t)); - spin_lock_init(&tracked_objs_arr[i]->hash_lock); - INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head); - if (bit_mask & strict_track_mask) - tracked_objs_arr[i]->strict_track = 1; - else - tracked_objs_arr[i]->strict_track = 0; - } - } - - - if (create_procfs_tree()) { - printk(KERN_ERR "%s: create_procfs_tree() failed\n", __FILE__); - goto undo_cache_create; - } - - printk(KERN_INFO "memtrack::%s done.\n", __func__); - - return 0; - -undo_cache_create: - for (j = 0; j < i; ++j) { - if (tracked_objs_arr[j]) - vfree(tracked_objs_arr[j]); - } - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) - if (kmem_cache_destroy(meminfo_cache) != 0) - printk(KERN_ERR "Failed on kmem_cache_destroy!\n"); -#else - kmem_cache_destroy(meminfo_cache); -#endif - return -1; -} - - -void cleanup_module(void) -{ - enum memtrack_memtype_t memtype; - unsigned long cur_bucket; - struct memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - - memtrack_report(); - - - destroy_procfs_tree(); - - /* clean up any hash table left-overs */ - for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { - /* Scan all buckets to find existing allocations */ - /* TBD: this may be optimized by holding a linked list of all hash items */ - 
if (tracked_objs_arr[memtype]) { - obj_desc_p = tracked_objs_arr[memtype]; - for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ - cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; - while (cur_mem_info_p != NULL) { /* scan bucket */ - next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */ - kmem_cache_free(meminfo_cache, cur_mem_info_p); - cur_mem_info_p = next_mem_info_p; - } /* while cur_mem_info_p */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - } /* for cur_bucket */ - vfree(obj_desc_p); - } - } /* for memtype */ - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) - if (kmem_cache_destroy(meminfo_cache) != 0) - printk(KERN_ERR "memtrack::cleanup_module: Failed on kmem_cache_destroy!\n"); -#else - kmem_cache_destroy(meminfo_cache); -#endif - printk(KERN_INFO "memtrack::cleanup_module done.\n"); -} Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.c ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.h (nonexistent) @@ -1,106 +0,0 @@ -/* - This software is available to you under a choice of one of two - licenses. You may choose to be licensed under the terms of the GNU - General Public License (GPL) Version 2, available at - , or the OpenIB.org BSD - license, available in the LICENSE.TXT file accompanying this - software. These details are also available at - . 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. -*/ - -#ifndef H_MEMTRACK_H -#define H_MEMTRACK_H - -enum memtrack_memtype_t { - MEMTRACK_KMALLOC, - MEMTRACK_VMALLOC, - MEMTRACK_KMEM_OBJ, - MEMTRACK_IOREMAP, /* IO-RE/UN-MAP */ - MEMTRACK_WORK_QUEUE, /* Handle work-queue create & destroy */ - MEMTRACK_PAGE_ALLOC, /* Handle page allocation and free */ - MEMTRACK_DMA_MAP_SINGLE,/* Handle ib_dma_single map and unmap */ - MEMTRACK_DMA_MAP_PAGE, /* Handle ib_dma_page map and unmap */ - MEMTRACK_DMA_MAP_SG, /* Handle ib_dma_sg map and unmap with and without attributes */ - MEMTRACK_NUM_OF_MEMTYPES -}; - -/* Invoke on memory allocation */ -void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, unsigned long addr2, - int direction, const char *filename, - const unsigned long line_num, int alloc_flags); - -/* Invoke on memory free */ -void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, int direction, - const char *filename, const unsigned long line_num); - -/* - * This function recognizes allocations which - * may be released by kernel (e.g. skb & vnic) and - * therefore not trackable by memtrack. - * The allocations are recognized by the name - * of their calling function. - */ -int is_non_trackable_alloc_func(const char *func_name); -/* - * In some cases we need to free a memory - * we defined as "non trackable" (see - * is_non_trackable_alloc_func). 
- * This function recognizes such releases - * by the name of their calling function. - */ -int is_non_trackable_free_func(const char *func_name); - -/* WA - In this function handles confirm - the function name is - '__ib_umem_release' or 'ib_umem_get' - In this case we won't track the - memory there because the kernel - was the one who allocated it. - Return value: - 1 - if the function name is match, else 0 */ -int is_umem_put_page(const char *func_name); - -/* Check page order size - When Freeing a page allocation it checks whether - we are trying to free the same amount of pages - we ask to allocate (In log2(order)). - In case an error if found it will print - an error msg */ -int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, - unsigned long size, const char *filename, - const unsigned long line_num); - -/* Search for a specific addr whether it exist in the - current data-base. - If not it will print an error msg, - Return value: 0 - if addr exist, else 1 */ -int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, - const char *filename, const unsigned long line_num); - -/* Return current page reference counter */ -int memtrack_get_page_ref_count(unsigned long addr); - -/* Report current allocations status (for all memory types) */ -/* we do not export this function since it is used by cleanup_module only */ -/* void memtrack_report(void); */ - -/* Allow support of error injections */ -int memtrack_inject_error(void); - -/* randomize allocated memory */ -int memtrack_randomize_mem(void); - -#endif Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/memtrack.h ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/Makefile =================================================================== --- 
projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/Makefile (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/Makefile (nonexistent) @@ -1,3 +0,0 @@ -EXTRA_CFLAGS := $(subst $(KERNEL_MEMTRACK_CFLAGS),,$(EXTRA_CFLAGS)) - -obj-m += memtrack.o Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/debug/Makefile ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig (nonexistent) @@ -1,50 +0,0 @@ -config INFINIBAND_IPOIB - tristate "IP-over-InfiniBand" - depends on NETDEVICES && INET && (IPV6 || IPV6=n) - select INET_LRO - ---help--- - Support for the IP-over-InfiniBand protocol (IPoIB). This - transports IP packets over InfiniBand so you can use your IB - device as a fancy NIC. - - See Documentation/infiniband/ipoib.txt for more information - -config INFINIBAND_IPOIB_CM - bool "IP-over-InfiniBand Connected Mode support" - depends on INFINIBAND_IPOIB - default n - ---help--- - This option enables support for IPoIB connected mode. After - enabling this option, you need to switch to connected mode - through /sys/class/net/ibXXX/mode to actually create - connections, and then increase the interface MTU with - e.g. ifconfig ib0 mtu 65520. - - WARNING: Enabling connected mode will trigger some packet - drops for multicast and UD mode traffic from this interface, - unless you limit mtu for these destinations to 2044. - -config INFINIBAND_IPOIB_DEBUG - bool "IP-over-InfiniBand debugging" if EMBEDDED - depends on INFINIBAND_IPOIB - default y - ---help--- - This option causes debugging code to be compiled into the - IPoIB driver. 
The output can be turned on via the - debug_level and mcast_debug_level module parameters (which - can also be set after the driver is loaded through sysfs). - - This option also creates a directory tree under ipoib/ in - debugfs, which contains files that expose debugging - information about IB multicast groups used by the IPoIB - driver. - -config INFINIBAND_IPOIB_DEBUG_DATA - bool "IP-over-InfiniBand data path debugging" - depends on INFINIBAND_IPOIB_DEBUG - ---help--- - This option compiles debugging code into the data path - of the IPoIB driver. The output can be turned on via the - data_debug_level module parameter; however, even with output - turned off, this debugging code will have some performance - impact. Property changes on: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (revision 319974) @@ -1,763 +1,763 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #ifndef _IPOIB_H #define _IPOIB_H #define LINUXKPI_PARAM_PREFIX ipoib_ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ofed.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include /* constants */ #define INFINIBAND_ALEN 20 /* Octets in IPoIB HW addr */ #ifdef IPOIB_CM #define CONFIG_INFINIBAND_IPOIB_CM #endif #ifdef IPOIB_DEBUG #define CONFIG_INFINIBAND_IPOIB_DEBUG #define CONFIG_INFINIBAND_IPOIB_DEBUG_DATA #endif enum ipoib_flush_level { IPOIB_FLUSH_LIGHT, IPOIB_FLUSH_NORMAL, IPOIB_FLUSH_HEAVY }; enum { IPOIB_ENCAP_LEN = 4, IPOIB_HEADER_LEN = IPOIB_ENCAP_LEN + INFINIBAND_ALEN, IPOIB_UD_MAX_MTU = 4 * 1024, // IPOIB_UD_RX_SG = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE), IPOIB_UD_RX_SG = 2, IPOIB_UD_TX_SG = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2, IPOIB_CM_MAX_MTU = (64 * 1024), IPOIB_CM_TX_SG = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2, IPOIB_CM_RX_SG = (IPOIB_CM_MAX_MTU / MJUMPAGESIZE), IPOIB_RX_RING_SIZE = 256, IPOIB_TX_RING_SIZE = 128, IPOIB_MAX_RX_SG = MAX(IPOIB_CM_RX_SG, IPOIB_UD_RX_SG), IPOIB_MAX_TX_SG = MAX(IPOIB_CM_TX_SG, IPOIB_UD_TX_SG), IPOIB_MAX_QUEUE_SIZE = 8192, IPOIB_MIN_QUEUE_SIZE = 2, IPOIB_CM_MAX_CONN_QP = 4096, IPOIB_NUM_WC = 4, IPOIB_MAX_PATH_REC_QUEUE = 3, IPOIB_MAX_MCAST_QUEUE = 3, IPOIB_FLAG_OPER_UP = 0, IPOIB_FLAG_INITIALIZED = 1, IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, IPOIB_PKEY_STOP = 4, IPOIB_FLAG_SUBINTERFACE = 5, IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_FLAG_UMCAST = 10, IPOIB_FLAG_CSUM = 11, IPOIB_MAX_BACKOFF_SECONDS = 16, IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ 
IPOIB_MCAST_FLAG_ATTACHED = 3, IPOIB_MAX_LRO_DESCRIPTORS = 8, IPOIB_LRO_MAX_AGGR = 64, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, }; #define IPOIB_OP_RECV (1ul << 31) #ifdef CONFIG_INFINIBAND_IPOIB_CM #define IPOIB_OP_CM (1ul << 30) #else #define IPOIB_OP_CM (0) #endif /* structs */ struct ipoib_header { u8 hwaddr[INFINIBAND_ALEN]; __be16 proto; u16 reserved; }; struct ipoib_pseudoheader { u8 hwaddr[INFINIBAND_ALEN]; }; /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ struct ipoib_mcast { struct ib_sa_mcmember_rec mcmember; struct ib_sa_multicast *mc; struct ipoib_ah *ah; struct rb_node rb_node; struct list_head list; unsigned long created; unsigned long backoff; unsigned long flags; unsigned char logcount; struct ifqueue pkt_queue; struct ipoib_dev_priv *priv; }; struct ipoib_cm_rx_buf { struct mbuf *mb; u64 mapping[IPOIB_CM_RX_SG]; }; struct ipoib_cm_tx_buf { struct mbuf *mb; u64 mapping[IPOIB_CM_TX_SG]; }; struct ipoib_rx_buf { struct mbuf *mb; u64 mapping[IPOIB_UD_RX_SG]; }; struct ipoib_tx_buf { struct mbuf *mb; u64 mapping[IPOIB_UD_TX_SG]; }; struct ib_cm_id; struct ipoib_cm_data { __be32 qpn; /* High byte MUST be ignored on receive */ __be32 mtu; }; /* * Quoting 10.3.1 Queue Pair and EE Context States: * * Note, for QPs that are associated with an SRQ, the Consumer should take the * QP through the Error State before invoking a Destroy QP or a Modify QP to the * Reset State. The Consumer may invoke the Destroy QP without first performing * a Modify QP to the Error State and waiting for the Affiliated Asynchronous * Last WQE Reached Event. However, if the Consumer does not wait for the * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment * leakage may occur. 
Therefore, it is good programming practice to tear down a * QP that is associated with an SRQ by using the following process: * * - Put the QP in the Error State * - Wait for the Affiliated Asynchronous Last WQE Reached Event; * - either: * drain the CQ by invoking the Poll CQ verb and either wait for CQ * to be empty or the number of Poll CQ operations has exceeded * CQ capacity size; * - or * post another WR that completes on the same CQ and wait for this * WR to return as a WC; * - and then invoke a Destroy QP or Reset QP. * * We use the second option and wait for a completion on the * same CQ before destroying QPs attached to our SRQ. */ enum ipoib_cm_state { IPOIB_CM_RX_LIVE, IPOIB_CM_RX_ERROR, /* Ignored by stale task */ IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ }; struct ipoib_cm_rx { struct ib_cm_id *id; struct ib_qp *qp; struct ipoib_cm_rx_buf *rx_ring; struct list_head list; struct ipoib_dev_priv *priv; unsigned long jiffies; enum ipoib_cm_state state; int recv_count; }; struct ipoib_cm_tx { struct ib_cm_id *id; struct ib_qp *qp; struct list_head list; struct ipoib_dev_priv *priv; struct ipoib_path *path; struct ipoib_cm_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; unsigned long flags; u32 mtu; /* remote specified mtu, with grh. 
*/ }; struct ipoib_cm_dev_priv { struct ib_srq *srq; struct ipoib_cm_rx_buf *srq_ring; struct ib_cm_id *id; struct list_head passive_ids; /* state: LIVE */ struct list_head rx_error_list; /* state: ERROR */ struct list_head rx_flush_list; /* state: FLUSH, drain not started */ struct list_head rx_drain_list; /* state: FLUSH, drain started */ struct list_head rx_reap_list; /* state: FLUSH, drain done */ struct work_struct start_task; struct work_struct reap_task; struct work_struct mb_task; struct work_struct rx_reap_task; struct delayed_work stale_task; struct ifqueue mb_queue; struct list_head start_list; struct list_head reap_list; struct ib_sge rx_sge[IPOIB_CM_RX_SG]; struct ib_recv_wr rx_wr; int nonsrq_conn_qp; int max_cm_mtu; /* Actual buf size. */ int num_frags; }; struct ipoib_ethtool_st { u16 coalesce_usecs; u16 max_coalesced_frames; }; /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. lock nests inside * of tx_lock (ie tx_lock must be acquired first if needed). */ struct ipoib_dev_priv { spinlock_t lock; spinlock_t drain_lock; struct ifnet *dev; u8 broadcastaddr[INFINIBAND_ALEN]; unsigned long flags; int gone; + int unit; struct mutex vlan_mutex; struct rb_root path_tree; struct list_head path_list; struct ipoib_mcast *broadcast; struct list_head multicast_list; struct rb_root multicast_tree; struct delayed_work pkey_poll_task; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; struct work_struct flush_normal; struct work_struct flush_heavy; struct work_struct restart_task; struct delayed_work ah_reap_task; struct ib_device *ca; u8 port; u16 pkey; u16 pkey_index; struct ib_pd *pd; - struct ib_mr *mr; struct ib_cq *recv_cq; struct ib_cq *send_cq; struct ib_qp *qp; u32 qkey; union ib_gid local_gid; u16 local_lid; unsigned int admin_mtu; /* User selected MTU, no GRH. */ unsigned int mcast_mtu; /* Minus GRH bytes, from mcast group. 
*/ unsigned int max_ib_mtu; /* Without header, actual buf size. */ struct ipoib_rx_buf *rx_ring; struct ipoib_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; struct ib_sge tx_sge[IPOIB_MAX_TX_SG]; - struct ib_send_wr tx_wr; + struct ib_ud_wr tx_wr; unsigned tx_outstanding; struct ib_wc send_wc[MAX_SEND_CQE]; struct ib_recv_wr rx_wr; struct ib_sge rx_sge[IPOIB_MAX_RX_SG]; struct ib_wc ibwc[IPOIB_NUM_WC]; struct list_head dead_ahs; struct ib_event_handler event_handler; struct ifnet *parent; struct list_head child_intfs; struct list_head list; #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_dev_priv cm; #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct list_head fs_list; struct dentry *mcg_dentry; struct dentry *path_dentry; #endif int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; }; struct ipoib_ah { struct ipoib_dev_priv *priv; struct ib_ah *ah; struct list_head list; struct kref ref; unsigned last_send; }; struct ipoib_path { struct ipoib_dev_priv *priv; struct rb_node rb_node; struct list_head list; #ifdef CONFIG_INFINIBAND_IPOIB_CM uint8_t hwaddr[INFINIBAND_ALEN]; struct ipoib_cm_tx *cm; #endif struct ipoib_ah *ah; struct ib_sa_path_rec pathrec; struct ifqueue queue; int query_id; struct ib_sa_query *query; struct completion done; int valid; }; /* UD Only transmits encap len but we want the two sizes to be symmetrical. 
*/ #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) #define IPOIB_CM_MTU(ib_mtu) (ib_mtu - 0x10) #define IPOIB_IS_MULTICAST(addr) ((addr)[4] == 0xff) extern struct workqueue_struct *ipoib_workqueue; #define IPOIB_MTAP_PROTO(_ifp, _m, _proto) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_proto((_ifp), (_m), (_proto)); \ } \ } while (0) /* functions */ void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *, struct ib_pd *pd, struct ib_ah_attr *attr); void ipoib_free_ah(struct kref *kref); static inline void ipoib_put_ah(struct ipoib_ah *ah) { kref_put(&ah->ref, ipoib_free_ah); } int ipoib_open(struct ipoib_dev_priv *priv); int ipoib_add_pkey_attr(struct ipoib_dev_priv *priv); int ipoib_add_umcast_attr(struct ipoib_dev_priv *priv); void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto); void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn); void ipoib_reap_ah(struct work_struct *work); void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv); void ipoib_flush_paths(struct ipoib_dev_priv *priv); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv); int ipoib_ib_dev_open(struct ipoib_dev_priv *priv); int ipoib_ib_dev_up(struct ipoib_dev_priv *priv); int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush); int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush); int ipoib_dev_init(struct ipoib_dev_priv 
*priv, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb); void ipoib_mcast_restart_task(struct work_struct *work); void ipoib_mcast_restart(struct ipoib_dev_priv *); int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv); int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush); void ipoib_mcast_dev_down(struct ipoib_dev_priv *priv); void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv); void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv); int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *gid, unsigned long *created, unsigned int *queuelen, unsigned int *complete, unsigned int *send_only); struct ipoib_path_iter *ipoib_path_iter_init(struct ipoib_dev_priv *priv); int ipoib_path_iter_next(struct ipoib_path_iter *iter); void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path); #endif int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu); int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey); int ipoib_init_qp(struct ipoib_dev_priv *priv); int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca); void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_event(struct ib_event_handler *handler, struct ib_event *record); void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct ipoib_dev_priv *priv); int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max); void ipoib_dma_unmap_tx(struct ib_device *ca, 
struct ipoib_tx_buf *tx_req); int ipoib_poll_tx(struct ipoib_dev_priv *priv); void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req); void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length); struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size); void ipoib_set_ethtool_ops(struct ifnet *dev); int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); #ifdef CONFIG_INFINIBAND_IPOIB_CM #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 0x40 /* We don't support UC connections at the moment */ #define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) extern int ipoib_max_conn_qp; static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { return IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)); } static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { return IPOIB_CM_SUPPORTED(hwaddr); } static inline int ipoib_cm_up(struct ipoib_path *path) { return test_bit(IPOIB_FLAG_OPER_UP, &path->cm->flags); } static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { return path->cm; } static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { path->cm = tx; } static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { return !!priv->cm.srq; } static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { return priv->cm.max_cm_mtu; } void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx); int ipoib_cm_dev_open(struct ipoib_dev_priv *priv); void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv); int ipoib_cm_dev_init(struct ipoib_dev_priv *priv); int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv); void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv); struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path); void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); void ipoib_cm_mb_too_long(struct 
ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu); void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); #else struct ipoib_cm_tx; #define ipoib_max_conn_qp 0 static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { return 0; } static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { return 0; } static inline int ipoib_cm_up(struct ipoib_path *path) { return 0; } static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { return NULL; } static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { } static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { return 0; } static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx) { return; } static inline int ipoib_cm_dev_open(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv) { return; } static inline int ipoib_cm_dev_init(struct ipoib_dev_priv *priv) { return -ENOSYS; } static inline void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv) { return; } static inline struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path) { return NULL; } static inline void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { return; } static inline int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu) { m_freem(mb); } static inline void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } static inline void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG void ipoib_create_debug_files(struct ipoib_dev_priv *priv); void 
ipoib_delete_debug_files(struct ipoib_dev_priv *priv); int ipoib_register_debugfs(void); void ipoib_unregister_debugfs(void); #else static inline void ipoib_create_debug_files(struct ipoib_dev_priv *priv) { } static inline void ipoib_delete_debug_files(struct ipoib_dev_priv *priv) { } static inline int ipoib_register_debugfs(void) { return 0; } static inline void ipoib_unregister_debugfs(void) { } #endif #define ipoib_printk(level, priv, format, arg...) \ printk(level "%s: " format, if_name(((struct ipoib_dev_priv *) priv)->dev), ## arg) #define ipoib_warn(priv, format, arg...) \ ipoib_printk(KERN_WARNING, priv, format , ## arg) extern int ipoib_sendq_size; extern int ipoib_recvq_size; extern struct ib_sa_client ipoib_sa_client; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG extern int ipoib_debug_level; #define ipoib_dbg(priv, format, arg...) \ do { \ if (ipoib_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #define ipoib_dbg_mcast(priv, format, arg...) \ do { \ if (mcast_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ #define ipoib_dbg(priv, format, arg...) \ do { (void) (priv); } while (0) #define ipoib_dbg_mcast(priv, format, arg...) \ do { (void) (priv); } while (0) #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA #define ipoib_dbg_data(priv, format, arg...) \ do { \ if (data_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ #define ipoib_dbg_data(priv, format, arg...) 
\ do { (void) (priv); } while (0) #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) #endif /* _IPOIB_H */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (revision 319974) @@ -1,998 +1,997 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" #include #include #include #include #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; module_param(data_debug_level, int, 0644); MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif static DEFINE_MUTEX(pkey_mutex); struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv, struct ib_pd *pd, struct ib_ah_attr *attr) { struct ipoib_ah *ah; ah = kmalloc(sizeof *ah, GFP_KERNEL); if (!ah) return NULL; ah->priv = priv; ah->last_send = 0; kref_init(&ah->ref); ah->ah = ib_create_ah(pd, attr); if (IS_ERR(ah->ah)) { kfree(ah); ah = NULL; } else ipoib_dbg(priv, "Created ah %p\n", ah->ah); return ah; } void ipoib_free_ah(struct kref *kref) { struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); struct ipoib_dev_priv *priv = ah->priv; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_add_tail(&ah->list, &priv->dead_ahs); spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req) { struct mbuf *m; int i; for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++) ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len, DMA_FROM_DEVICE); } void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length) { m_adj(mb, -(mb->m_pkthdr.len - length)); } struct mbuf * ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size) { struct mbuf *mb, *m; int i, j; rx_req->mb = NULL; mb = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR); if (mb == NULL) return (NULL); for (i = 0, m = mb; m != NULL; m = m->m_next, i++) { m->m_len = M_SIZE(m); mb->m_pkthdr.len += m->m_len; rx_req->mapping[i] = 
ib_dma_map_single(priv->ca, mtod(m, void *), m->m_len, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(priv->ca, rx_req->mapping[i]))) goto error; } rx_req->mb = mb; return (mb); error: for (j = 0, m = mb; j < i; m = m->m_next, j++) ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len, DMA_FROM_DEVICE); m_freem(mb); return (NULL); } static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id) { struct ipoib_rx_buf *rx_req; struct ib_recv_wr *bad_wr; struct mbuf *m; int ret; int i; rx_req = &priv->rx_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { priv->rx_sge[i].addr = rx_req->mapping[i]; priv->rx_sge[i].length = m->m_len; } priv->rx_wr.num_sge = i; priv->rx_wr.wr_id = id | IPOIB_OP_RECV; ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]); m_freem(priv->rx_ring[id].mb); priv->rx_ring[id].mb = NULL; } return ret; } static struct mbuf * ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id) { return ipoib_alloc_map_mb(priv, &priv->rx_ring[id], priv->max_ib_mtu + IB_GRH_BYTES); } static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv) { int i; for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_alloc_rx_mb(priv, i)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); return -ENOMEM; } if (ipoib_ib_post_receive(priv, i)) { ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); return -EIO; } } return 0; } static void ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ipoib_rx_buf saverx; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; struct ifnet *dev = priv->dev; struct ipoib_header *eh; struct mbuf *mb; ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", wr_id, ipoib_recvq_size); return; } mb = 
priv->rx_ring[wr_id].mb; if (unlikely(wc->status != IB_WC_SUCCESS)) { if (wc->status != IB_WC_WR_FLUSH_ERR) { ipoib_warn(priv, "failed recv event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); goto repost; } if (mb) { ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]); m_freem(mb); priv->rx_ring[wr_id].mb = NULL; } return; } /* * Drop packets that this interface sent, ie multicast packets * that the HCA has replicated. */ if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) goto repost; memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx)); /* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer. */ if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) { memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx)); if_inc_counter(dev, IFCOUNTER_IQDROPS, 1); goto repost; } ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); ipoib_dma_unmap_rx(priv, &saverx); ipoib_dma_mb(priv, mb, wc->byte_len); if_inc_counter(dev, IFCOUNTER_IPACKETS, 1); if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len); mb->m_pkthdr.rcvif = dev; m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN); eh = mtod(mb, struct ipoib_header *); bzero(eh->hwaddr, 4); /* Zero the queue pair, only dgid is in grh */ - if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) + if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID; dev->if_input(dev, mb); repost: if (unlikely(ipoib_ib_post_receive(priv, wr_id))) ipoib_warn(priv, "ipoib_ib_post_receive failed " "for buf %d\n", wr_id); }
/*
 * DMA-map every mbuf of tx_req->mb for transmission, storing one bus
 * address per mbuf in tx_req->mapping[].
 *
 * Zero-length mbufs are unlinked and freed first.  If more than 'max'
 * mbufs remain, the chain is compacted with m_defrag() and tx_req->mb is
 * updated to point at the new chain.
 *
 * Returns 0 on success or -EIO on failure.  On failure nothing is left
 * mapped and tx_req->mb still references a valid chain, which the caller
 * is expected to free (ipoib_send() does m_freem(tx_req->mb)).
 */
int
ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max)
{
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m, *p;
	int error;
	int i;

	/* Unlink and free empty mbufs; i ends up as the number kept. */
	for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
		if (m->m_len != 0)
			continue;
		if (p == NULL)
			panic("ipoib_dma_map_tx: First mbuf empty\n");
		p->m_next = m_free(m);
		/* Step back so the loop increment revisits the new successor. */
		m = p;
		i--;
	}
	i--;
	if (i >= max) {
		/*
		 * m_defrag() returns NULL and leaves the original chain
		 * untouched when it fails.  Only overwrite tx_req->mb on
		 * success; storing NULL there would drop the last reference
		 * to the original chain and leak it, because the caller only
		 * frees tx_req->mb when it is non-NULL.
		 */
		m = m_defrag(mb, M_NOWAIT);
		if (m == NULL)
			return -EIO;
		tx_req->mb = mb = m;
		/* Recount the mbufs in the compacted chain. */
		for (m = mb, i = 0; m != NULL; m = m->m_next, i++)
			;
		/*
		 * NOTE(review): this rejects a chain of exactly 'max' mbufs,
		 * whereas the pre-defrag check above accepts one -- confirm
		 * which bound is intended.
		 */
		if (i >= max)
			return -EIO;
	}
	error = 0;
	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		mapping[i] = ib_dma_map_single(ca, mtod(m, void *),
		    m->m_len, DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) {
			error = -EIO;
			break;
		}
	}
	if (error) {
		int end;

		/* Undo the mappings made before the failing one. */
		end = i;
		for (m = mb, i = 0; i < end; m = m->m_next, i++)
			ib_dma_unmap_single(ca, mapping[i], m->m_len,
			    DMA_TO_DEVICE);
	}
	return error;
}
void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) { struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m; int i; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE); } static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ifnet *dev = priv->dev; unsigned int wr_id = wc->wr_id; struct ipoib_tx_buf *tx_req; ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_sendq_size)) { ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", wr_id, ipoib_sendq_size); return; } tx_req = &priv->tx_ring[wr_id]; ipoib_dma_unmap_tx(priv->ca, tx_req); if_inc_counter(dev, IFCOUNTER_OPACKETS, 1); m_freem(tx_req->mb); ++priv->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && (dev->if_drv_flags & IFF_DRV_OACTIVE) && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) ipoib_warn(priv, "failed send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); } int ipoib_poll_tx(struct ipoib_dev_priv *priv) { int n, i; n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); for (i = 0; i < n; ++i) { struct ib_wc *wc = priv->send_wc + i; if (wc->wr_id 
& IPOIB_OP_CM) ipoib_cm_handle_tx_wc(priv, wc); else ipoib_ib_handle_tx_wc(priv, wc); } return n == MAX_SEND_CQE; } static void ipoib_poll(struct ipoib_dev_priv *priv) { int n, i; poll_more: spin_lock(&priv->drain_lock); for (;;) { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; i++) { struct ib_wc *wc = priv->ibwc + i; if ((wc->wr_id & IPOIB_OP_RECV) == 0) panic("ipoib_poll: Bad wr_id 0x%jX\n", (intmax_t)wc->wr_id); if (wc->wr_id & IPOIB_OP_CM) ipoib_cm_handle_rx_wc(priv, wc); else ipoib_ib_handle_rx_wc(priv, wc); } if (n != IPOIB_NUM_WC) break; } spin_unlock(&priv->drain_lock); if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)) goto poll_more; } void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) { struct ipoib_dev_priv *priv = dev_ptr; ipoib_poll(priv); } static void drain_tx_cq(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; spin_lock(&priv->lock); while (ipoib_poll_tx(priv)) ; /* nothing */ if (dev->if_drv_flags & IFF_DRV_OACTIVE) mod_timer(&priv->poll_timer, jiffies + 1); spin_unlock(&priv->lock); } void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) { struct ipoib_dev_priv *priv = dev_ptr; mod_timer(&priv->poll_timer, jiffies); } static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head, int hlen) { struct ib_send_wr *bad_wr; struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; struct mbuf *m; int i; for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { priv->tx_sge[i].addr = mapping[i]; priv->tx_sge[i].length = m->m_len; } - priv->tx_wr.num_sge = i; - priv->tx_wr.wr_id = wr_id; - priv->tx_wr.wr.ud.remote_qpn = qpn; - priv->tx_wr.wr.ud.ah = address; + priv->tx_wr.wr.num_sge = i; + priv->tx_wr.wr.wr_id = wr_id; + priv->tx_wr.remote_qpn = qpn; + priv->tx_wr.ah = address; - if (head) { - priv->tx_wr.wr.ud.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */ - priv->tx_wr.wr.ud.header 
= head; - priv->tx_wr.wr.ud.hlen = hlen; - priv->tx_wr.opcode = IB_WR_LSO; + priv->tx_wr.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */ + priv->tx_wr.header = head; + priv->tx_wr.hlen = hlen; + priv->tx_wr.wr.opcode = IB_WR_LSO; } else - priv->tx_wr.opcode = IB_WR_SEND; + priv->tx_wr.wr.opcode = IB_WR_SEND; - return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); + return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr); } void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn) { struct ifnet *dev = priv->dev; struct ipoib_tx_buf *tx_req; int hlen; void *phead; if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) while (ipoib_poll_tx(priv)) ; /* nothing */ m_adj(mb, sizeof (struct ipoib_pseudoheader)); if (0 /* XXX segment offload mb_is_gso(mb) */) { /* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */ phead = mtod(mb, void *); if (mb->m_len < hlen) { ipoib_warn(priv, "linear data too small\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); return; } m_adj(mb, hlen); } else { if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", mb->m_pkthdr.len, priv->mcast_mtu); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu); return; } phead = NULL; hlen = 0; } ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", mb->m_pkthdr.len, address, qpn); /* * We put the mb into the tx_ring _before_ we call post_send() * because it's entirely possible that the completion handler will * run before we execute anything after the post_send(). That * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). 
*/ tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; tx_req->mb = mb; if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); if (tx_req->mb) m_freem(tx_req->mb); return; } if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) - priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; + priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM; else - priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) ipoib_warn(priv, "request notify on send CQ failed\n"); dev->if_drv_flags |= IFF_DRV_OACTIVE; } if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, tx_req, phead, hlen))) { ipoib_warn(priv, "post_send failed\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); --priv->tx_outstanding; ipoib_dma_unmap_tx(priv->ca, tx_req); m_freem(mb); if (dev->if_drv_flags & IFF_DRV_OACTIVE) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; } else { address->last_send = priv->tx_head; ++priv->tx_head; } } static void __ipoib_reap_ah(struct ipoib_dev_priv *priv) { struct ipoib_ah *ah, *tah; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) if ((int) priv->tx_tail - (int) ah->last_send >= 0) { list_del(&ah->list); ib_destroy_ah(ah->ah); kfree(ah); } spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_reap_ah(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, ah_reap_task.work); __ipoib_reap_ah(priv); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); } static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv) { unsigned long begin; begin = jiffies; while (!list_empty(&priv->dead_ahs)) { __ipoib_reap_ah(priv); if 
(time_after(jiffies, begin + HZ)) { ipoib_warn(priv, "timing out; will leak address handles\n"); break; } msleep(1); } } static void ipoib_ib_tx_timer_func(unsigned long ctx) { drain_tx_cq((struct ipoib_dev_priv *)ctx); } int ipoib_ib_dev_open(struct ipoib_dev_priv *priv) { int ret; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); return -1; } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = ipoib_init_qp(priv); if (ret) { ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); return -1; } ret = ipoib_ib_post_receives(priv); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); ipoib_ib_dev_stop(priv, 1); return -1; } ret = ipoib_cm_dev_open(priv); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); ipoib_ib_dev_stop(priv, 1); return -1; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); return 0; } static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv) { u16 pkey_index = 0; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); else set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } int ipoib_ib_dev_up(struct ipoib_dev_priv *priv) { ipoib_pkey_dev_check_presence(priv); if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { ipoib_dbg(priv, "PKEY is not assigned.\n"); return 0; } set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); return ipoib_mcast_start_thread(priv); } int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush) { ipoib_dbg(priv, "downing ib_dev\n"); clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); if_link_state_change(priv->dev, LINK_STATE_DOWN); /* Shutdown the P_Key thread if still active */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); cancel_delayed_work(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); 
if (flush) flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(priv, flush); ipoib_mcast_dev_flush(priv); ipoib_flush_paths(priv); return 0; } static int recvs_pending(struct ipoib_dev_priv *priv) { int pending = 0; int i; for (i = 0; i < ipoib_recvq_size; ++i) if (priv->rx_ring[i].mb) ++pending; return pending; } void ipoib_drain_cq(struct ipoib_dev_priv *priv) { int i, n; spin_lock(&priv->drain_lock); do { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { /* * Convert any successful completions to flush * errors to avoid passing packets up the * stack after bringing the device down. */ if (priv->ibwc[i].status == IB_WC_SUCCESS) priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0) panic("ipoib_drain_cq: Bad wrid 0x%jX\n", (intmax_t)priv->ibwc[i].wr_id); if (priv->ibwc[i].wr_id & IPOIB_OP_CM) ipoib_cm_handle_rx_wc(priv, priv->ibwc + i); else ipoib_ib_handle_rx_wc(priv, priv->ibwc + i); } } while (n == IPOIB_NUM_WC); spin_unlock(&priv->drain_lock); spin_lock(&priv->lock); while (ipoib_poll_tx(priv)) ; /* nothing */ spin_unlock(&priv->lock); } int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush) { struct ib_qp_attr qp_attr; unsigned long begin; struct ipoib_tx_buf *tx_req; int i; ipoib_cm_dev_stop(priv); /* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. */ qp_attr.qp_state = IB_QPS_ERR; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); /* Wait for all sends and receives to complete */ begin = jiffies; while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", priv->tx_head - priv->tx_tail, recvs_pending(priv)); /* * assume the HW is wedged and just free up * all our pending work requests. 
*/ while ((int) priv->tx_tail - (int) priv->tx_head < 0) { tx_req = &priv->tx_ring[priv->tx_tail & (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv->ca, tx_req); m_freem(tx_req->mb); ++priv->tx_tail; --priv->tx_outstanding; } for (i = 0; i < ipoib_recvq_size; ++i) { struct ipoib_rx_buf *rx_req; rx_req = &priv->rx_ring[i]; if (!rx_req->mb) continue; ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]); m_freem(rx_req->mb); rx_req->mb = NULL; } goto timeout; } ipoib_drain_cq(priv); msleep(1); } ipoib_dbg(priv, "All sends and receives done.\n"); timeout: del_timer_sync(&priv->poll_timer); qp_attr.qp_state = IB_QPS_RESET; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); /* Wait for all AHs to be reaped */ set_bit(IPOIB_STOP_REAPER, &priv->flags); cancel_delayed_work(&priv->ah_reap_task); if (flush) flush_workqueue(ipoib_workqueue); ipoib_ah_dev_cleanup(priv); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); return 0; } int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { struct ifnet *dev = priv->dev; priv->ca = ca; priv->port = port; priv->qp = NULL; if (ipoib_transport_dev_init(priv, ca)) { printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); return -ENODEV; } setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, (unsigned long) priv); if (dev->if_flags & IFF_UP) { if (ipoib_ib_dev_open(priv)) { ipoib_transport_dev_cleanup(priv); return -ENODEV; } } return 0; } static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; u16 new_index; mutex_lock(&priv->vlan_mutex); /* * Flush any child interfaces too -- they might be up even if * the parent is down. 
*/ list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); mutex_unlock(&priv->vlan_mutex); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } if (level == IPOIB_FLUSH_HEAVY) { if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (ipoib_pkey_dev_delay_open(priv)) return; } /* restart QP only if P_Key index is changed */ if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && new_index == priv->pkey_index) { ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; } priv->pkey_index = new_index; } if (level == IPOIB_FLUSH_LIGHT) { ipoib_mark_paths_invalid(priv); ipoib_mcast_dev_flush(priv); } if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_down(priv, 0); if (level == IPOIB_FLUSH_HEAVY) { ipoib_ib_dev_stop(priv, 0); ipoib_ib_dev_open(priv); } /* * The device could have been brought down between the start and when * we get here, don't bring it back up if it's not configured up */ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_up(priv); ipoib_mcast_restart_task(&priv->restart_task); } } void ipoib_ib_dev_flush_light(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_light); __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); } void ipoib_ib_dev_flush_normal(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_normal); __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); } void ipoib_ib_dev_flush_heavy(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_heavy); __ipoib_ib_dev_flush(priv, 
IPOIB_FLUSH_HEAVY); } /* Tear down the IB side of the interface: stops the multicast thread (flush = 1), then calls ipoib_mcast_dev_flush(), ipoib_ah_dev_cleanup() and ipoib_transport_dev_cleanup() in that order. */ void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv) { ipoib_dbg(priv, "cleaning up ib_dev\n"); ipoib_mcast_stop_thread(priv, 1); ipoib_mcast_dev_flush(priv); ipoib_ah_dev_cleanup(priv); ipoib_transport_dev_cleanup(priv); } /* * Delayed P_Key Assignment Interim Support * * The following is an initial implementation of the delayed P_Key assignment * mechanism. It is using the same approach implemented for the multicast * group join. The single goal of this implementation is to quickly address * Bug #2507. This implementation will probably be removed when the P_Key * change async notification is available. */ /* Work handler for pkey_poll_task: re-checks P_Key presence and calls ipoib_open() once IPOIB_PKEY_ASSIGNED is set; otherwise it re-queues itself HZ ticks later unless IPOIB_PKEY_STOP is set. */ void ipoib_pkey_poll(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); ipoib_pkey_dev_check_presence(priv); if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) ipoib_open(priv); else { /* pkey_mutex serializes the IPOIB_PKEY_STOP test with the set_bit/cancel in ipoib_ib_dev_down() */ mutex_lock(&pkey_mutex); if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); } } /* Returns 0 if the P_Key is already assigned; otherwise clears IPOIB_PKEY_STOP, schedules pkey_poll_task to run after HZ ticks and returns 1. */ int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv) { /* Look for the interface pkey value in the IB Port P_Key table and */ /* set the interface pkey assignment flag */ ipoib_pkey_dev_check_presence(priv); /* P_Key value not assigned yet - start polling */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); clear_bit(IPOIB_PKEY_STOP, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ); mutex_unlock(&pkey_mutex); return 1; } return 0; } Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 319974) @@ -1,1551 +1,1565 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. 
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "ipoib.h" static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #include #include #include #include #include /* For ARPHRD_xxx */ #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); -static void ipoib_remove_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void ipoib_input(struct ifnet *ifp, struct mbuf *m); #define IPOIB_MTAP(_ifp, _m) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_mb((_ifp), (_m)); \ } \ } while (0) +static struct unrhdr *ipoib_unrhdr; + +static void +ipoib_unrhdr_init(void *arg) +{ + + ipoib_unrhdr = new_unrhdr(0, 65535, NULL); +} +SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, 
SI_ORDER_ANY, ipoib_unrhdr_init, NULL); + +static void +ipoib_unrhdr_uninit(void *arg) +{ + + if (ipoib_unrhdr != NULL) { + struct unrhdr *hdr; + + hdr = ipoib_unrhdr; + ipoib_unrhdr = NULL; + + delete_unrhdr(hdr); + } +} +SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL); + /* * This is for clients that have an ipoib_header in the mbuf. */ static void ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) { struct ipoib_header *ih; struct ether_header eh; ih = mtod(mb, struct ipoib_header *); eh.ether_type = ih->proto; bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); bzero(&eh.ether_shost, ETHER_ADDR_LEN); mb->m_data += sizeof(struct ipoib_header); mb->m_len -= sizeof(struct ipoib_header); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(struct ipoib_header); mb->m_len += sizeof(struct ipoib_header); } void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) { struct ether_header eh; eh.ether_type = proto; bzero(&eh.ether_shost, ETHER_ADDR_LEN); bzero(&eh.ether_dhost, ETHER_ADDR_LEN); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); } static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: 
clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu) { struct ifnet *dev = priv->dev; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); dev->if_mtu = new_mtu; return 0; } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) 
ipoib_stop(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, INFINIBAND_ALEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ error = -ipoib_change_mtu(priv, ifr->ifr_mtu); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct 
ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct 
ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = 
priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) _IF_ENQUEUE(&path->queue, mb); else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if 
(ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; IPOIB_MTAP(dev, mb); ipoib_send_one(priv, mb); } spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof 
*priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: return -ENOMEM; } static void ipoib_detach(struct ipoib_dev_priv *priv) { struct ifnet *dev; dev = priv->dev; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { priv->gone = 1; bpfdetach(dev); if_detach(dev); if_free(dev); + free_unr(ipoib_unrhdr, priv->unit); } else VLAN_SETCOOKIE(priv->dev, NULL); free(priv, M_TEMP); } void ipoib_dev_cleanup(struct ipoib_dev_priv *priv) { struct ipoib_dev_priv *cpriv, *tcpriv; /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { ipoib_dev_cleanup(cpriv); ipoib_detach(cpriv); } ipoib_ib_dev_cleanup(priv); kfree(priv->rx_ring); kfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } -static volatile int ipoib_unit; - static struct ipoib_dev_priv * ipoib_priv_alloc(void) { struct ipoib_dev_priv *priv; priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); spin_lock_init(&priv->drain_lock); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, 
ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN); return (priv); } struct ipoib_dev_priv * ipoib_intf_alloc(const char *name) { struct ipoib_dev_priv *priv; struct sockaddr_dl *sdl; struct ifnet *dev; priv = ipoib_priv_alloc(); dev = priv->dev = if_alloc(IFT_INFINIBAND); if (!dev) { free(priv, M_TEMP); return NULL; } dev->if_softc = priv; - if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1)); + priv->unit = alloc_unr(ipoib_unrhdr); + if (priv->unit == -1) { + if_free(dev); + free(priv, M_TEMP); + return NULL; + } + if_initname(dev, name, priv->unit); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; dev->if_addrlen = INFINIBAND_ALEN; dev->if_hdrlen = IPOIB_HEADER_LEN; if_attach(dev); dev->if_init = ipoib_init; dev->if_ioctl = ipoib_ioctl; dev->if_start = ipoib_start; dev->if_output = ipoib_output; dev->if_input = ipoib_input; dev->if_resolvemulti = ipoib_resolvemulti; dev->if_baudrate = IF_Gbps(10); dev->if_broadcastaddr = priv->broadcastaddr; dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr; sdl->sdl_type = IFT_INFINIBAND; sdl->sdl_alen = dev->if_addrlen; priv->dev = dev; if_link_state_change(dev, LINK_STATE_DOWN); bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN); return dev->if_softc; } int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { - struct ib_device_attr *device_attr; - int result = -ENOMEM; + struct ib_device_attr *device_attr = &hca->attrs; - device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); - if (!device_attr) { - printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", - hca->name, sizeof *device_attr); - return result; - } - - result = ib_query_device(hca, device_attr); - if (result) { - printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", - hca->name, result); - kfree(device_attr); - return result; - } priv->hca_caps = device_attr->device_cap_flags; - 
kfree(device_attr); - priv->dev->if_hwassist = 0; priv->dev->if_capabilities = 0; #ifndef CONFIG_INFINIBAND_IPOIB_CM if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { set_bit(IPOIB_FLAG_CSUM, &priv->flags); priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } #if 0 if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) { priv->dev->if_capabilities |= IFCAP_TSO4; priv->dev->if_hwassist |= CSUM_TSO; } #endif #endif priv->dev->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE; priv->dev->if_capenable = priv->dev->if_capabilities; return 0; } static struct ifnet * ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; struct ib_port_attr attr; int result = -ENOMEM; priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } /* MTU will be reset when mcast join happens */ priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu; result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_set_dev_features(priv, hca)) goto device_init_failed; /* * Set the full membership bit, so that we join the right * broadcast group, etc. 
*/ priv->pkey |= 0x8000; priv->broadcastaddr[8] = priv->pkey >> 8; priv->broadcastaddr[9] = priv->pkey & 0xff; - result = ib_query_gid(hca, port, 0, &priv->local_gid); + result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); result = ipoib_dev_init(priv, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_cm_admin_enabled(priv)) priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); result = ib_register_event_handler(&priv->event_handler); if (result < 0) { printk(KERN_WARNING "%s: ib_register_event_handler failed for " "port %d (ret = %d)\n", hca->name, port, result); goto event_failed; } if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port); return priv->dev; event_failed: ipoib_dev_cleanup(priv); device_init_failed: ipoib_detach(priv); alloc_mem_failed: return ERR_PTR(result); } static void ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct ifnet *dev; struct ipoib_dev_priv *priv; int s, e, p; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); if (!dev_list) return; INIT_LIST_HEAD(dev_list); if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } for (p = s; p <= e; ++p) { if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; dev = ipoib_add_port("ib", device, p); if (!IS_ERR(dev)) { priv = dev->if_softc; list_add_tail(&priv->list, dev_list); } } ib_set_client_data(device, &ipoib_client, dev_list); } static void -ipoib_remove_one(struct ib_device *device) +ipoib_remove_one(struct 
ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp; - struct list_head *dev_list; + struct list_head *dev_list = client_data; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + if (!dev_list) return; - dev_list = ib_get_client_data(device, &ipoib_client); + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; list_for_each_entry_safe(priv, tmp, dev_list, list) { if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) continue; ipoib_stop(priv); ib_unregister_event_handler(&priv->event_handler); /* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */ flush_workqueue(ipoib_workqueue); ipoib_dev_cleanup(priv); ipoib_detach(priv); } kfree(dev_list); } static void ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; int error; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev == NULL) return; priv = NULL; error = 0; parent = ifp->if_softc; /* We only support 15 bits of pkey. */ if (vtag & 0x8000) return; pkey = vtag | 0x8000; /* Set full membership bit. 
*/ if (pkey == parent->pkey) return; /* Check for dups */ mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { priv = NULL; error = EBUSY; goto out; } } priv = ipoib_priv_alloc(); priv->dev = dev; priv->max_ib_mtu = parent->max_ib_mtu; priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); error = ipoib_set_dev_features(priv, parent->ca); if (error) goto out; priv->pkey = pkey; priv->broadcastaddr[8] = pkey >> 8; priv->broadcastaddr[9] = pkey & 0xff; dev->if_broadcastaddr = priv->broadcastaddr; error = ipoib_dev_init(priv, parent->ca, parent->port); if (error) goto out; priv->parent = parent->dev; list_add_tail(&priv->list, &parent->child_intfs); VLAN_SETCOOKIE(dev, priv); dev->if_start = ipoib_vlan_start; dev->if_drv_flags &= ~IFF_DRV_RUNNING; dev->if_hdrlen = IPOIB_HEADER_LEN; if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_open(priv); mutex_unlock(&parent->vlan_mutex); return; out: mutex_unlock(&parent->vlan_mutex); if (priv) free(priv, M_TEMP); if (error) ipoib_warn(parent, "failed to initialize subinterface: device %s, port %d vtag 0x%X", parent->ca->name, parent->port, vtag); return; } static void ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev) VLAN_SETCOOKIE(dev, NULL); pkey = vtag | 0x8000; parent = ifp->if_softc; mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { ipoib_dev_cleanup(priv); list_del(&priv->list); break; } } mutex_unlock(&parent->vlan_mutex); } eventhandler_tag ipoib_vlan_attach; eventhandler_tag ipoib_vlan_detach; static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, 
IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE)); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST); ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST); /* * We create our own workqueue mainly because we want to be * able to flush it when devices are being removed. We can't * use schedule_work()/flush_scheduled_work() because both * unregister_netdev() and linkwatch_event take the rtnl lock, * so flush_scheduled_work() can deadlock during device * removal. */ ipoib_workqueue = create_singlethread_workqueue("ipoib"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; return 0; err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: return ret; } static void __exit ipoib_cleanup_module(void) { EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); } /* * Infiniband output routine. 
*/ static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_char edst[INFINIBAND_ALEN]; #if defined(INET) || defined(INET6) struct llentry *lle = NULL; #endif struct ipoib_header *eh; int error = 0, is_gw = 0; short type; if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) { error = ENETDOWN; goto bad; } if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); else error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); else bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); else error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); error = EAFNOSUPPORT; goto bad; } /* * Add local net header. If no space in first mbuf, * allocate another. 
*/ M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } eh = mtod(m, struct ipoib_header *); (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Upper layer processing for a received Infiniband packet. */ void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) { int isr; #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { if_printf(ifp, "discard frame at IFF_MONITOR\n"); m_freem(m); return; } /* * Dispatch frame to upper layer. */ switch (proto) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: m_freem(m); } /* * Process a received Infiniband packet. */ static void ipoib_input(struct ifnet *ifp, struct mbuf *m) { struct ipoib_header *eh; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } CURVNET_SET_QUIET(ifp->if_vnet); /* Let BPF have it before we strip the header. */ IPOIB_MTAP(ifp, m); eh = mtod(m, struct ipoib_header *); /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Infiniband header. 
*/ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, IPOIB_HEADER_LEN); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } ipoib_demux(ifp, m, ntohs(eh->proto)); CURVNET_RESTORE(); } static int ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!IPOIB_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all * of the multicast address used for IP6. * This has no meaning in ipoib. 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return EADDRNOTAVAIL; if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: return EAFNOSUPPORT; } } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); static int ipoib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ipoib_mod = { .name = "ipoib", .evhand = ipoib_evhand, }; DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1); Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (revision 319974) @@ -1,915 +1,915 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. 
* * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" #include #include #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static int mcast_debug_level = 1; module_param(mcast_debug_level, int, 0644); MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0"); #endif static DEFINE_MUTEX(mcast_mutex); struct ipoib_mcast_iter { struct ipoib_dev_priv *priv; union ib_gid mgid; unsigned long created; unsigned int queuelen; unsigned int complete; unsigned int send_only; }; static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct ifnet *dev = mcast->priv->dev; int tx_dropped = 0; ipoib_dbg_mcast(mcast->priv, "deleting multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); if (mcast->ah) ipoib_put_ah(mcast->ah); tx_dropped = mcast->pkt_queue.ifq_len; _IF_DRAIN(&mcast->pkt_queue); /* XXX Locking. */ if_inc_counter(dev, IFCOUNTER_OERRORS, tx_dropped); kfree(mcast); } static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv, int can_sleep) { struct ipoib_mcast *mcast; mcast = kzalloc(sizeof *mcast, can_sleep ? 
GFP_KERNEL : GFP_ATOMIC); if (!mcast) return NULL; mcast->priv = priv; mcast->created = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); bzero(&mcast->pkt_queue, sizeof(mcast->pkt_queue)); return mcast; } static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv, void *mgid) { struct rb_node *n = priv->multicast_tree.rb_node; while (n) { struct ipoib_mcast *mcast; int ret; mcast = rb_entry(n, struct ipoib_mcast, rb_node); ret = memcmp(mgid, mcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return mcast; } return NULL; } static int __ipoib_mcast_add(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast) { struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; while (*n) { struct ipoib_mcast *tmcast; int ret; pn = *n; tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&mcast->rb_node, pn, n); rb_insert_color(&mcast->rb_node, &priv->multicast_tree); return 0; } static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, struct ib_sa_mcmember_rec *mcmember) { struct ipoib_dev_priv *priv = mcast->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah; int ret; int set_qkey = 0; mcast->mcmember = *mcmember; /* Set the cached Q_Key before we attach if it's the broadcast group */ if (!memcmp(mcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid))) { spin_lock_irq(&priv->lock); if (!priv->broadcast) { spin_unlock_irq(&priv->lock); return -EAGAIN; } priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); - priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + priv->tx_wr.remote_qkey = priv->qkey; set_qkey = 1; } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) 
{ ipoib_warn(priv, "multicast group %16D already attached\n", mcast->mcmember.mgid.raw, ":"); return 0; } ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid), &mcast->mcmember.mgid, set_qkey); if (ret < 0) { ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); return ret; } } { struct ib_ah_attr av = { .dlid = be16_to_cpu(mcast->mcmember.mlid), .port_num = priv->port, .sl = mcast->mcmember.sl, .ah_flags = IB_AH_GRH, .static_rate = mcast->mcmember.rate, .grh = { .flow_label = be32_to_cpu(mcast->mcmember.flow_label), .hop_limit = mcast->mcmember.hop_limit, .sgid_index = 0, .traffic_class = mcast->mcmember.traffic_class } }; av.grh.dgid = mcast->mcmember.mgid; ah = ipoib_create_ah(priv, priv->pd, &av); if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); } else { spin_lock_irq(&priv->lock); mcast->ah = ah; spin_unlock_irq(&priv->lock); ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n", mcast->mcmember.mgid.raw, ":", mcast->ah->ah, be16_to_cpu(mcast->mcmember.mlid), mcast->mcmember.sl); } } /* actually send any queued packets */ while (mcast->pkt_queue.ifq_len) { struct mbuf *mb; _IF_DEQUEUE(&mcast->pkt_queue, mb); mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); } return 0; } static int ipoib_mcast_sendonly_join_complete(int status, struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = multicast->context; struct ipoib_dev_priv *priv = mcast->priv; /* We trap for port events ourselves. 
*/ if (status == -ENETRESET) return 0; if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (status) { if (mcast->logcount++ < 20) ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); /* Flush out any queued packets */ if_inc_counter(priv->dev, IFCOUNTER_OERRORS, mcast->pkt_queue.ifq_len); _IF_DRAIN(&mcast->pkt_queue); /* Clear the busy flag so we try again */ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); } return status; } static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) { struct ipoib_dev_priv *priv = mcast->priv; struct ib_sa_mcmember_rec rec = { #if 0 /* Some SMs don't support send-only yet */ .join_state = 4 #else .join_state = 1 #endif }; int ret = 0; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); return -ENODEV; } if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); return -EBUSY; } rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE, GFP_ATOMIC, ipoib_mcast_sendonly_join_complete, mcast); if (IS_ERR(mcast->mc)) { ret = PTR_ERR(mcast->mc); clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", ret); } else { ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n", mcast->mcmember.mgid.raw, ":"); } return ret; } void ipoib_mcast_carrier_on_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; /* * Take rtnl_lock to avoid racing with ipoib_stop() and * turning the carrier back on while a device is being * removed. 
*/ if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } if_link_state_change(priv->dev, LINK_STATE_UP); } static int ipoib_mcast_join_complete(int status, struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = multicast->context; struct ipoib_dev_priv *priv = mcast->priv; ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n", mcast->mcmember.mgid.raw, ":", status); /* We trap for port events ourselves. */ if (status == -ENETRESET) return 0; if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (!status) { mcast->backoff = 1; mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); /* * Defer carrier on work to ipoib_workqueue to avoid a * deadlock on rtnl_lock here. */ if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); return 0; } if (mcast->logcount++ < 20) { if (status == -ETIMEDOUT || status == -EAGAIN) { ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); } else { ipoib_warn(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); } } mcast->backoff *= 2; if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; /* Clear the busy flag so we try again */ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mutex_lock(&mcast_mutex); spin_lock_irq(&priv->lock); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); return status; } static void ipoib_mcast_join(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast, int create) { struct ib_sa_mcmember_rec rec = { .join_state = 1 }; ib_sa_comp_mask comp_mask; int ret = 0; 
ipoib_dbg_mcast(priv, "joining MGID %16D\n", mcast->mcmember.mgid.raw, ":"); rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE; if (create) { comp_mask |= IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_HOP_LIMIT; rec.qkey = priv->broadcast->mcmember.qkey; rec.mtu_selector = IB_SA_EQ; rec.mtu = priv->broadcast->mcmember.mtu; rec.traffic_class = priv->broadcast->mcmember.traffic_class; rec.rate_selector = IB_SA_EQ; rec.rate = priv->broadcast->mcmember.rate; rec.sl = priv->broadcast->mcmember.sl; rec.flow_label = priv->broadcast->mcmember.flow_label; rec.hop_limit = priv->broadcast->mcmember.hop_limit; } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); mcast->backoff *= 2; if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, mcast->backoff * HZ); mutex_unlock(&mcast_mutex); } } void ipoib_mcast_join_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct ifnet *dev = priv->dev; struct ib_port_attr attr; ipoib_dbg_mcast(priv, "Running join task. 
flags 0x%lX\n", priv->flags); if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n", __func__, attr.state); return; } - if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL)) ipoib_warn(priv, "ib_query_gid() failed\n"); else memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); { struct ib_port_attr attr; if (!ib_query_port(priv->ca, priv->port, &attr)) priv->local_lid = attr.lid; else ipoib_warn(priv, "ib_query_port failed\n"); } if (!priv->broadcast) { struct ipoib_mcast *broadcast; if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) return; broadcast = ipoib_mcast_alloc(priv, 1); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, HZ); mutex_unlock(&mcast_mutex); return; } spin_lock_irq(&priv->lock); memcpy(broadcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; __ipoib_mcast_add(priv, priv->broadcast); spin_unlock_irq(&priv->lock); } if (priv->broadcast && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { if (priv->broadcast && !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) ipoib_mcast_join(priv, priv->broadcast, 0); return; } while (1) { struct ipoib_mcast *mcast = NULL; spin_lock_irq(&priv->lock); list_for_each_entry(mcast, &priv->multicast_list, list) { if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { /* Found the next unjoined group */ break; } } spin_unlock_irq(&priv->lock); if (&mcast->list == &priv->multicast_list) { /* All done */ break; } ipoib_mcast_join(priv, mcast, 
1); return; } spin_lock_irq(&priv->lock); if (priv->broadcast) priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); else priv->mcast_mtu = priv->admin_mtu; spin_unlock_irq(&priv->lock); if (!ipoib_cm_admin_enabled(priv)) ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu)); ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); } int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv) { ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n", priv->flags); mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); return 0; } int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush) { ipoib_dbg_mcast(priv, "stopping multicast thread\n"); mutex_lock(&mcast_mutex); clear_bit(IPOIB_MCAST_RUN, &priv->flags); cancel_delayed_work(&priv->mcast_task); mutex_unlock(&mcast_mutex); if (flush) flush_workqueue(ipoib_workqueue); return 0; } static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast) { int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { ipoib_dbg_mcast(priv, "leaving MGID %16D\n", mcast->mcmember.mgid.raw, ":"); /* Remove ourselves from the multicast group */ ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, be16_to_cpu(mcast->mcmember.mlid)); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); } return 0; } void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb) { struct ifnet *dev = priv->dev; struct ipoib_mcast *mcast; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || !priv->broadcast || !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); return; } mcast = 
__ipoib_mcast_find(priv, mgid); if (!mcast) { /* Let's create a new send only group now */ ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n", mgid, ":"); mcast = ipoib_mcast_alloc(priv, 0); if (!mcast) { ipoib_warn(priv, "unable to allocate memory for " "multicast structure\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); goto out; } set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); __ipoib_mcast_add(priv, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } if (!mcast->ah) { if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) { _IF_ENQUEUE(&mcast->pkt_queue, mb); } else { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ipoib_dbg_mcast(priv, "no address vector, " "but multicast join already started\n"); else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) ipoib_mcast_sendonly_join(mcast); /* * If lookup completes between here and out:, don't * want to send packet twice. 
*/ mcast = NULL; } out: if (mcast && mcast->ah) ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN); } void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv) { LIST_HEAD(remove_list); struct ipoib_mcast *mcast, *tmcast; unsigned long flags; ipoib_dbg_mcast(priv, "flushing multicast list\n"); spin_lock_irqsave(&priv->lock, flags); list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { list_del(&mcast->list); rb_erase(&mcast->rb_node, &priv->multicast_tree); list_add_tail(&mcast->list, &remove_list); } if (priv->broadcast) { rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); list_add_tail(&priv->broadcast->list, &remove_list); priv->broadcast = NULL; } spin_unlock_irqrestore(&priv->lock, flags); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(priv, mcast); ipoib_mcast_free(mcast); } } static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen, const u8 *broadcast) { if (addrlen != INFINIBAND_ALEN) return 0; /* reserved QPN, prefix, scope */ if (memcmp(addr, broadcast, 6)) return 0; /* signature lower, pkey */ if (memcmp(addr + 7, broadcast + 7, 3)) return 0; return 1; } void ipoib_mcast_restart_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, restart_task); ipoib_mcast_restart(priv); } void ipoib_mcast_restart(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; struct ifmultiaddr *ifma; struct ipoib_mcast *mcast, *tmcast; LIST_HEAD(remove_list); struct ib_sa_mcmember_rec rec; int addrlen; ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n", priv->flags); ipoib_mcast_stop_thread(priv, 0); if_maddr_rlock(dev); spin_lock(&priv->lock); /* * Unfortunately, the networking core only gives us a list of all of * the multicast hardware addresses. 
We need to figure out which ones * are new and which ones have been removed */ /* Clear out the found flag */ list_for_each_entry(mcast, &priv->multicast_list, list) clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); /* Mark all of the entries that are found or don't exist */ TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { union ib_gid mgid; uint8_t *addr; if (ifma->ifma_addr->sa_family != AF_LINK) continue; addr = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); addrlen = ((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen; if (!ipoib_mcast_addr_is_valid(addr, addrlen, dev->if_broadcastaddr)) continue; memcpy(mgid.raw, addr + 4, sizeof mgid); mcast = __ipoib_mcast_find(priv, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { struct ipoib_mcast *nmcast; /* ignore group which is directly joined by userspace */ if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %16D\n", mgid.raw, ":"); continue; } /* Not found or send-only group, let's add a new entry */ ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n", mgid.raw, ":"); nmcast = ipoib_mcast_alloc(priv, 0); if (!nmcast) { ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); continue; } set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags); nmcast->mcmember.mgid = mgid; if (mcast) { /* Destroy the send only entry */ list_move_tail(&mcast->list, &remove_list); rb_replace_node(&mcast->rb_node, &nmcast->rb_node, &priv->multicast_tree); } else __ipoib_mcast_add(priv, nmcast); list_add_tail(&nmcast->list, &priv->multicast_list); } if (mcast) set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); } /* Remove all of the entries don't exist anymore */ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { ipoib_dbg_mcast(priv, "deleting 
multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); rb_erase(&mcast->rb_node, &priv->multicast_tree); /* Move to the remove list */ list_move_tail(&mcast->list, &remove_list); } } spin_unlock(&priv->lock); if_maddr_runlock(dev); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(mcast->priv, mcast); ipoib_mcast_free(mcast); } if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) ipoib_mcast_start_thread(priv); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_mcast_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->mgid.raw, 0, 16); if (ipoib_mcast_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_mcast *mcast; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->multicast_tree); while (n) { mcast = rb_entry(n, struct ipoib_mcast, rb_node); if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw, sizeof (union ib_gid)) < 0) { iter->mgid = mcast->mcmember.mgid; iter->created = mcast->created; iter->queuelen = mcast->pkt_queue.ifq_len; iter->complete = !!mcast->ah; iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *mgid, unsigned long *created, unsigned int *queuelen, unsigned int *complete, unsigned int *send_only) { *mgid = iter->mgid; *created = iter->created; *queuelen = iter->queuelen; *complete = iter->complete; *send_only = iter->send_only; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c =================================================================== 
--- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c (revision 319974) @@ -1,294 +1,285 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "ipoib.h" int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey) { struct ib_qp_attr *qp_attr = NULL; int ret; u16 pkey_index; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = -ENXIO; goto out; } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); if (set_qkey) { ret = -ENOMEM; qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); if (!qp_attr) goto out; /* set correct QKey for QP */ qp_attr->qkey = priv->qkey; ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); if (ret) { ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); goto out; } } /* attach QP to multicast group */ ret = ib_attach_mcast(priv->qp, mgid, mlid); if (ret) ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); out: kfree(qp_attr); return ret; } int ipoib_init_qp(struct ipoib_dev_priv *priv) { int ret; struct ib_qp_attr qp_attr; int attr_mask; if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return -1; qp_attr.qp_state = IB_QPS_INIT; qp_attr.qkey = 0; qp_attr.port_num = priv->port; qp_attr.pkey_index = priv->pkey_index; attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret); goto out_fail; } qp_attr.qp_state = IB_QPS_RTR; /* Can't set this in a INIT->RTR transition */ attr_mask &= ~IB_QP_PORT; ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret); goto out_fail; } qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; attr_mask |= IB_QP_SQ_PSN; attr_mask &= ~IB_QP_PKEY_INDEX; ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret); goto out_fail; } return 0; out_fail: qp_attr.qp_state = IB_QPS_RESET; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to 
RESET state\n"); return ret; } int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca) { struct ib_qp_init_attr init_attr = { .cap = { .max_send_wr = ipoib_sendq_size, .max_recv_wr = ipoib_recvq_size, .max_send_sge = 1, .max_recv_sge = IPOIB_UD_RX_SG }, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_UD }; + struct ib_cq_init_attr cq_attr = {}; int ret, size; int i; /* XXX struct ethtool_coalesce *coal; */ - priv->pd = ib_alloc_pd(priv->ca); + priv->pd = ib_alloc_pd(priv->ca, 0); if (IS_ERR(priv->pd)) { printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); return -ENODEV; } - priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(priv->mr)) { - printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); - goto out_free_pd; - } - size = ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(priv); if (!ret) { size += ipoib_sendq_size; if (ipoib_cm_has_srq(priv)) size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ else size += ipoib_recvq_size * ipoib_max_conn_qp; } - priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, size, 0); + cq_attr.cqe = size; + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, &cq_attr); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); goto out_free_mr; } + cq_attr.cqe = ipoib_sendq_size; priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, - priv, ipoib_sendq_size, 0); + priv, &cq_attr); if (IS_ERR(priv->send_cq)) { printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); goto out_free_recv_cq; } if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) goto out_free_send_cq; #if 0 /* XXX */ coal = kzalloc(sizeof *coal, GFP_KERNEL); if (coal) { coal->rx_coalesce_usecs = 10; coal->tx_coalesce_usecs = 10; coal->rx_max_coalesced_frames = 16; coal->tx_max_coalesced_frames = 16; dev->ethtool_ops->set_coalesce(dev, coal); kfree(coal); } #endif init_attr.send_cq = priv->send_cq; 
init_attr.recv_cq = priv->recv_cq; if (priv->hca_caps & IB_DEVICE_UD_TSO) init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; init_attr.cap.max_send_sge = IPOIB_UD_TX_SG; priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { printk(KERN_WARNING "%s: failed to create QP\n", ca->name); goto out_free_send_cq; } IF_LLADDR(priv->dev)[1] = (priv->qp->qp_num >> 16) & 0xff; IF_LLADDR(priv->dev)[2] = (priv->qp->qp_num >> 8) & 0xff; IF_LLADDR(priv->dev)[3] = (priv->qp->qp_num ) & 0xff; for (i = 0; i < IPOIB_MAX_TX_SG; ++i) - priv->tx_sge[i].lkey = priv->mr->lkey; + priv->tx_sge[i].lkey = priv->pd->local_dma_lkey; - priv->tx_wr.opcode = IB_WR_SEND; - priv->tx_wr.sg_list = priv->tx_sge; - priv->tx_wr.send_flags = IB_SEND_SIGNALED; + priv->tx_wr.wr.opcode = IB_WR_SEND; + priv->tx_wr.wr.sg_list = priv->tx_sge; + priv->tx_wr.wr.send_flags = IB_SEND_SIGNALED; for (i = 0; i < IPOIB_UD_RX_SG; ++i) - priv->rx_sge[i].lkey = priv->mr->lkey; + priv->rx_sge[i].lkey = priv->pd->local_dma_lkey; priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; return 0; out_free_send_cq: ib_destroy_cq(priv->send_cq); out_free_recv_cq: ib_destroy_cq(priv->recv_cq); out_free_mr: - ib_dereg_mr(priv->mr); ipoib_cm_dev_cleanup(priv); -out_free_pd: ib_dealloc_pd(priv->pd); return -ENODEV; } void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv) { if (priv->qp) { if (ib_destroy_qp(priv->qp)) ipoib_warn(priv, "ib_qp_destroy failed\n"); priv->qp = NULL; clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } if (ib_destroy_cq(priv->send_cq)) ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); if (ib_destroy_cq(priv->recv_cq)) ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); ipoib_cm_dev_cleanup(priv); - if (ib_dereg_mr(priv->mr)) - ipoib_warn(priv, "ib_dereg_mr failed\n"); - - if (ib_dealloc_pd(priv->pd)) - ipoib_warn(priv, "ib_dealloc_pd failed\n"); + 
ib_dealloc_pd(priv->pd); } void ipoib_event(struct ib_event_handler *handler, struct ib_event *record) { struct ipoib_dev_priv *priv = container_of(handler, struct ipoib_dev_priv, event_handler); if (record->element.port_num != priv->port) return; ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, record->device->name, record->element.port_num); if (record->event == IB_EVENT_SM_CHANGE || record->event == IB_EVENT_CLIENT_REREGISTER) { queue_work(ipoib_workqueue, &priv->flush_light); } else if (record->event == IB_EVENT_PORT_ERR || record->event == IB_EVENT_PORT_ACTIVE || record->event == IB_EVENT_LID_CHANGE) { queue_work(ipoib_workqueue, &priv->flush_normal); } else if (record->event == IB_EVENT_PKEY_CHANGE) { queue_work(ipoib_workqueue, &priv->flush_heavy); } } Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_peer_mem.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_peer_mem.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_peer_mem.h (nonexistent) @@ -1,59 +0,0 @@ -#if !defined(IB_PEER_MEM_H) -#define IB_PEER_MEM_H - -#include - - -struct invalidation_ctx; -struct ib_ucontext; - -struct ib_peer_memory_statistics { - unsigned long num_alloc_mrs; - unsigned long num_dealloc_mrs; - unsigned long num_reg_pages; - unsigned long num_dereg_pages; - unsigned long num_free_callbacks; -}; - -struct ib_peer_memory_client { - const struct peer_memory_client *peer_mem; - - struct list_head core_peer_list; - struct list_head core_ticket_list; - unsigned long last_ticket; -#ifdef __FreeBSD__ - int holdcount; - int needwakeup; - struct cv peer_cv; -#else - struct srcu_struct peer_srcu; -#endif - struct mutex lock; - struct kobject *kobj; - struct attribute_group peer_mem_attr_group; - struct ib_peer_memory_statistics stats; -}; - -struct core_ticket { - unsigned long key; - void *context; - struct list_head ticket_list; -}; - -struct ib_peer_memory_client 
*ib_get_peer_client(struct ib_ucontext *context, unsigned long addr, - size_t size, void **peer_client_context, - int *srcu_key); - -void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, - void *peer_client_context, - int srcu_key); - -unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client, - void *context); -int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key); -struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key); -#endif - - Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_peer_mem.h ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/peer_mem.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/peer_mem.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/peer_mem.h (nonexistent) @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2013, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if !defined(PEER_MEM_H) -#define PEER_MEM_H - -#include -#include -#include -#include -#include - - -#define IB_PEER_MEMORY_NAME_MAX 64 -#define IB_PEER_MEMORY_VER_MAX 16 - -struct peer_memory_client { - char name[IB_PEER_MEMORY_NAME_MAX]; - char version[IB_PEER_MEMORY_VER_MAX]; - /* acquire return code: 1 mine, 0 - not mine */ - int (*acquire) (unsigned long addr, size_t size, void *peer_mem_private_data, - char *peer_mem_name, void **client_context); - int (*get_pages) (unsigned long addr, - size_t size, int write, int force, - struct sg_table *sg_head, - void *client_context, void *core_context); - int (*dma_map) (struct sg_table *sg_head, void *client_context, - struct device *dma_device, int dmasync, int *nmap); - int (*dma_unmap) (struct sg_table *sg_head, void *client_context, - struct device *dma_device); - void (*put_pages) (struct sg_table *sg_head, void *client_context); - unsigned long (*get_page_size) (void *client_context); - void (*release) (void *client_context); - -}; - -typedef int (*invalidate_peer_memory)(void *reg_handle, - void *core_context); - -void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, - invalidate_peer_memory *invalidate_callback); -void ib_unregister_peer_memory_client(void *reg_handle); 
- -#endif Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/peer_mem.h ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/sdp_socket.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/sdp_socket.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/sdp_socket.h (nonexistent) @@ -1,23 +0,0 @@ -/* Stuff that should go into include/linux/socket.h */ - -#ifndef SDP_SOCKET_H -#define SDP_SOCKET_H - -#ifndef __FreeBSD__ -#ifndef AF_INET_SDP -#define AF_INET_SDP 27 -#define PF_INET_SDP AF_INET_SDP -#endif -#endif - -#ifndef SDP_ZCOPY_THRESH -#define SDP_ZCOPY_THRESH 80 -#endif - -#ifndef SDP_LAST_BIND_ERR -#define SDP_LAST_BIND_ERR 81 -#endif - -/* TODO: AF_INET6_SDP ? */ - -#endif Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/sdp_socket.h ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_verbs_exp.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_verbs_exp.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_verbs_exp.h (nonexistent) @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * Copyright (c) 2006 Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IB_USER_VERBS_EXP_H -#define IB_USER_VERBS_EXP_H - -#include - -enum { - IB_USER_VERBS_EXP_CMD_FIRST = 64 -}; - -enum { - IB_USER_VERBS_EXP_CMD_CREATE_QP, - IB_USER_VERBS_EXP_CMD_MODIFY_CQ, - IB_USER_VERBS_EXP_CMD_MODIFY_QP, - IB_USER_VERBS_EXP_CMD_CREATE_CQ, - IB_USER_VERBS_EXP_CMD_QUERY_DEVICE, - IB_USER_VERBS_EXP_CMD_CREATE_DCT, - IB_USER_VERBS_EXP_CMD_DESTROY_DCT, - IB_USER_VERBS_EXP_CMD_QUERY_DCT, -}; - -/* - * Make sure that all structs defined in this file remain laid out so - * that they pack the same way on 32-bit and 64-bit architectures (to - * avoid incompatibility between 32-bit userspace and 64-bit kernels). - * Specifically: - * - Do not use pointer types -- pass pointers in __u64 instead. 
- * - Make sure that any structure larger than 4 bytes is padded to a - * multiple of 8 bytes. Otherwise the structure size will be - * different between 32-bit and 64-bit architectures. - */ - -enum ib_uverbs_exp_create_qp_comp_mask { - IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS = (1ULL << 0), - IB_UVERBS_EXP_CREATE_QP_INL_RECV = (1ULL << 1), - IB_UVERBS_EXP_CREATE_QP_QPG = (1ULL << 2) -}; - -struct ib_uverbs_qpg_init_attrib { - __u32 tss_child_count; - __u32 rss_child_count; -}; - -struct ib_uverbs_qpg { - __u32 qpg_type; - union { - struct { - __u32 parent_handle; - __u32 reserved; - }; - struct ib_uverbs_qpg_init_attrib parent_attrib; - }; - __u32 reserved2; -}; - -struct ib_uverbs_exp_create_qp { - __u64 comp_mask; - __u64 user_handle; - __u32 pd_handle; - __u32 send_cq_handle; - __u32 recv_cq_handle; - __u32 srq_handle; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u8 sq_sig_all; - __u8 qp_type; - __u8 is_srq; - __u8 reserved; - __u64 qp_cap_flags; - __u32 max_inl_recv; - __u32 reserved1; - struct ib_uverbs_qpg qpg; - __u64 driver_data[0]; -}; - -enum ib_uverbs_exp_create_qp_resp_comp_mask { - IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV = (1ULL << 0), -}; - -struct ib_uverbs_exp_create_qp_resp { - __u64 comp_mask; - __u32 qp_handle; - __u32 qpn; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u32 max_inl_recv; -}; - -struct ib_uverbs_create_dct { - __u64 comp_mask; - __u64 user_handle; - __u32 pd_handle; - __u32 cq_handle; - __u32 srq_handle; - __u32 access_flags; - __u32 flow_label; - __u64 dc_key; - __u8 min_rnr_timer; - __u8 tclass; - __u8 port; - __u8 pkey_index; - __u8 gid_index; - __u8 hop_limit; - __u8 mtu; - __u8 rsvd; - __u32 create_flags; - __u64 driver_data[0]; -}; - -struct ib_uverbs_create_dct_resp { - __u32 dct_handle; - __u32 dctn; -}; - -struct ib_uverbs_destroy_dct { - __u64 comp_mask; - __u64 user_handle; -}; - -struct 
ib_uverbs_destroy_dct_resp { - __u64 reserved; -}; - -struct ib_uverbs_query_dct { - __u64 comp_mask; - __u64 dct_handle; - __u64 driver_data[0]; -}; - -struct ib_uverbs_query_dct_resp { - __u64 dc_key; - __u32 access_flags; - __u32 flow_label; - __u32 key_violations; - __u8 port; - __u8 min_rnr_timer; - __u8 tclass; - __u8 mtu; - __u8 pkey_index; - __u8 gid_index; - __u8 hop_limit; - __u8 state; - __u32 rsvd; - __u64 driver_data[0]; -}; - -struct ib_uverbs_exp_query_device { - __u64 comp_mask; - __u64 driver_data[0]; -}; - -struct ib_uverbs_exp_query_device_resp { - __u64 comp_mask; - struct ib_uverbs_query_device_resp base; - __u64 timestamp_mask; - __u64 hca_core_clock; - __u64 device_cap_flags2; - __u32 dc_rd_req; - __u32 dc_rd_res; - __u32 inline_recv_sz; - __u32 max_rss_tbl_sz; -}; - -#endif /* IB_USER_VERBS_EXP_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_verbs_exp.h ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_verbs_exp.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_verbs_exp.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_verbs_exp.h (nonexistent) @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2004 Infinicon Corporation. All rights reserved. - * Copyright (c) 2004 Intel Corporation. All rights reserved. - * Copyright (c) 2004 Topspin Corporation. All rights reserved. - * Copyright (c) 2004 Voltaire Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2005, 2006, 2007 Cisco Systems. 
All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef IB_VERBS_EXP_H -#define IB_VERBS_EXP_H - -#include - - -enum ib_exp_device_cap_flags2 { - IB_EXP_DEVICE_DC_TRANSPORT = 1 << 0, - IB_EXP_DEVICE_QPG = 1 << 1, - IB_EXP_DEVICE_UD_RSS = 1 << 2, - IB_EXP_DEVICE_UD_TSS = 1 << 3 -}; - -enum ib_exp_device_attr_comp_mask { - IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1, - IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2, - IB_EXP_DEVICE_ATTR_CAP_FLAGS2 = 1ULL << 3, - IB_EXP_DEVICE_ATTR_DC_REQ_RD = 1ULL << 4, - IB_EXP_DEVICE_ATTR_DC_RES_RD = 1ULL << 5, - IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ = 1ULL << 6, - IB_EXP_DEVICE_ATTR_RSS_TBL_SZ = 1ULL << 7, -}; - -struct ib_exp_device_attr { - struct ib_device_attr base; - /* Use IB_EXP_DEVICE_ATTR_... for exp_comp_mask */ - uint32_t exp_comp_mask; - uint64_t device_cap_flags2; - uint32_t dc_rd_req; - uint32_t dc_rd_res; - uint32_t inline_recv_sz; - uint32_t max_rss_tbl_sz; -}; - -struct ib_exp_qp_init_attr { - void (*event_handler)(struct ib_event *, void *); - void *qp_context; - struct ib_cq *send_cq; - struct ib_cq *recv_cq; - struct ib_srq *srq; - struct ib_xrcd *xrcd; /* XRC TGT QPs only */ - struct ib_qp_cap cap; - union { - struct ib_qp *qpg_parent; /* see qpg_type */ - struct ib_qpg_init_attrib parent_attrib; - }; - enum ib_sig_type sq_sig_type; - enum ib_qp_type qp_type; - enum ib_qp_create_flags create_flags; - enum ib_qpg_type qpg_type; - u8 port_num; /* special QP types only */ - u32 max_inl_recv; -}; - - -int ib_exp_query_device(struct ib_device *device, - struct ib_exp_device_attr *device_attr); - - - - -#endif /* IB_VERBS_EXP_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_verbs_exp.h ___________________________________________________________________ Deleted: fbsd:nokeywords ## -1 +0,0 ## -true \ No newline at end of property Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: 
projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib.h (revision 319974) @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(_RDMA_IB_H) +#define _RDMA_IB_H + +#include +#include +#include + +/* + * Define a native infiniband address as in Linux upstream + * 8d36eb01da5d371feffa280e501377b5c450f5a5 + */ +#define AF_IB 41 + +struct ib_addr { + union { + __u8 uib_addr8[16]; + __be16 uib_addr16[8]; + __be32 uib_addr32[4]; + __be64 uib_addr64[2]; + } ib_u; +#define sib_addr8 ib_u.uib_addr8 +#define sib_addr16 ib_u.uib_addr16 +#define sib_addr32 ib_u.uib_addr32 +#define sib_addr64 ib_u.uib_addr64 +#define sib_raw ib_u.uib_addr8 +#define sib_subnet_prefix ib_u.uib_addr64[0] +#define sib_interface_id ib_u.uib_addr64[1] +}; + +static inline int ib_addr_any(const struct ib_addr *a) +{ + return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0); +} + +static inline int ib_addr_loopback(const struct ib_addr *a) +{ + return ((a->sib_addr32[0] | a->sib_addr32[1] | + a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0); +} + +static inline void ib_addr_set(struct ib_addr *addr, + __be32 w1, __be32 w2, __be32 w3, __be32 w4) +{ + addr->sib_addr32[0] = w1; + addr->sib_addr32[1] = w2; + addr->sib_addr32[2] = w3; + addr->sib_addr32[3] = w4; +} + +static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct ib_addr)); +} + +struct sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + __be16 sib_pkey; + __be32 sib_flowinfo; + struct ib_addr sib_addr; + __be64 sib_sid; + __be64 sib_sid_mask; + __u64 sib_scope_id; +}; + +/* + * The IB interfaces that use write() as bi-directional ioctl() are + * fundamentally unsafe, since there are lots of ways to trigger "write()" + * calls from various contexts with elevated privileges. That includes the + * traditional suid executable error message writes, but also various kernel + * interfaces that can write to file descriptors. + * + * This function provides protection for the legacy API by restricting the + * calling context. 
+ */ +static inline bool ib_safe_file_access(struct file *filp) +{ + struct thread *td = curthread; + + /* + * Check if called from userspace through a devfs related + * system call belonging to the given file: + */ + return (filp->_file != NULL && + filp->_file == td->td_fpop && + filp->_file->f_cred == td->td_ucred); +} + +#endif /* _RDMA_IB_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_addr.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_addr.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_addr.h (revision 319974) @@ -1,319 +1,337 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef IB_ADDR_H +#if !defined(IB_ADDR_H) #define IB_ADDR_H #include #include #include #include #include #include #include +#include +#include +#include #include #include -#include #include struct rdma_addr_client { atomic_t refcount; struct completion comp; }; /** * rdma_addr_register_client - Register an address client. */ void rdma_addr_register_client(struct rdma_addr_client *client); /** * rdma_addr_unregister_client - Deregister an address client. * @client: Client object to deregister. */ void rdma_addr_unregister_client(struct rdma_addr_client *client); +/** + * struct rdma_dev_addr - Contains resolved RDMA hardware addresses + * @src_dev_addr: Source MAC address. + * @dst_dev_addr: Destination MAC address. + * @broadcast: Broadcast address of the device. + * @dev_type: The interface hardware type of the device. + * @bound_dev_if: An optional device interface index. + * @transport: The transport type used. + * @net: Network namespace containing the bound_dev_if net_dev. + */ +struct vnet; struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; + struct vnet *net; + enum rdma_network_type network; + int hoplimit; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. + * + * The dev_addr->net field must be initialized. 
*/ -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, - u16 *vlan_id); +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id); /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. * @client: Address client associated with request. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has - * been invoked. + * been invoked. The net field of the addr struct must be valid. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), void *context); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr); + void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, const unsigned char *dst_dev_addr); -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id, - u32 scope_id); -int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac, - u16 *vlan_id, u32 scope_id); -static inline int ip_addr_size(struct sockaddr *addr) -{ - return addr->sa_family == AF_INET6 ? 
- sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); -} +int rdma_addr_size(struct sockaddr *addr); +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *smac, u16 *vlan_id, int *if_index, + int *hoplimit); + static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; } static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) { dev_addr->broadcast[8] = pkey >> 8; dev_addr->broadcast[9] = (unsigned char) pkey; } static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { uint16_t tag; if (VLAN_TAG(__DECONST(struct ifnet *, dev), &tag) != 0) return 0xffff; return tag; } -static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) +static inline int rdma_ip2gid(const struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: - ipv6_addr_set_v4mapped(((struct sockaddr_in *)addr)->sin_addr.s_addr, + ipv6_addr_set_v4mapped(((const struct sockaddr_in *) + addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: - memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, - 16); + memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16); break; default: return -EINVAL; } return 0; } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ -static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid, - uint32_t scope_id) +static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) { - if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { + if 
(ipv6_addr_v4mapped((const struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_len = sizeof(*out_in); out_in->sin_family = AF_INET; memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); } else { struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin6_len = sizeof(*out_in); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); - if (scope_id < 256 && - IN6_IS_SCOPE_LINKLOCAL(&out_in->sin6_addr)) - out_in->sin6_scope_id = scope_id; } - return 0; } -u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num); - -/* This func is called only in loopback ip address (127.0.0.1) - * case in which sgid is not relevant - */ static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { + struct net_device *dev; + struct ifaddr *ifa; + + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (dev) { + TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET) + continue; + ipv6_addr_set_v4mapped(((struct sockaddr_in *) + ifa->ifa_addr)->sin_addr.s_addr, + (struct in6_addr *)gid); + break; + } + dev_put(dev); + } } static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { if (dev_addr->transport == RDMA_TRANSPORT_IB && dev_addr->dev_type != ARPHRD_INFINIBAND) iboe_addr_get_sgid(dev_addr, gid); else memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_dgid(struct rdma_dev_addr 
*dev_addr, union ib_gid *gid) { memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline enum ib_mtu iboe_get_mtu(int mtu) { /* * reduce IB headers from effective IBoE MTU. 28 stands for * atomic header which is the biggest possible header after BTH */ mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28; if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) return IB_MTU_4096; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) return IB_MTU_2048; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) return IB_MTU_1024; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) return IB_MTU_512; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) return IB_MTU_256; else return 0; } static inline int iboe_get_rate(struct net_device *dev) { - if (dev->if_baudrate >= IF_Gbps(40)) + uint64_t baudrate = dev->if_baudrate; +#ifdef if_baudrate_pf + int exp; + for (exp = dev->if_baudrate_pf; exp > 0; exp--) + baudrate *= 10; +#endif + if (baudrate >= IF_Gbps(40)) return IB_RATE_40_GBPS; - else if (dev->if_baudrate >= IF_Gbps(30)) + else if (baudrate >= IF_Gbps(30)) return IB_RATE_30_GBPS; - else if (dev->if_baudrate >= IF_Gbps(20)) + else if (baudrate >= IF_Gbps(20)) return IB_RATE_20_GBPS; - else if (dev->if_baudrate >= IF_Gbps(10)) + else if (baudrate >= IF_Gbps(10)) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; } static inline int rdma_link_local_addr(struct in6_addr *addr) { if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; return 0; } static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) { memcpy(mac, &addr->s6_addr[8], 3); memcpy(mac + 3, &addr->s6_addr[13], 3); mac[0] ^= 2; } static inline int rdma_is_multicast_addr(struct in6_addr *addr) { return addr->s6_addr[0] == 0xff; } -static inline void resolve_mcast_mac(struct in6_addr *addr, u8 *mac) -{ - if (addr->s6_addr[0] != 0xff) - return; - -#ifdef DUAL_MODE_MCAST_MAC - if (addr->s6_addr[1] == 0x0e) /* IPv4 */ - ip_eth_mc_map(addr->s6_addr32[3], mac); - else 
-#endif - ipv6_eth_mc_map(addr, mac); -} - - static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; mac[0] = 0x33; mac[1] = 0x33; for (i = 2; i < 6; ++i) mac[i] = addr->s6_addr[i + 10]; } static inline u16 rdma_get_vlan_id(union ib_gid *dgid) { u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; - return vid < 0x1000 ? vid : 0xffff; + return vid < 0x1000 ? vid : 0xffff; } static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { return VLAN_TRUNKDEV(__DECONST(struct ifnet *, dev)); } #endif /* IB_ADDR_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_cache.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_cache.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_cache.h (revision 319974) @@ -1,132 +1,168 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _IB_CACHE_H #define _IB_CACHE_H #include /** * ib_get_cached_gid - Returns a cached GID table entry * @device: The device to query. * @port_num: The port number of the device to query. * @index: The index into the cached GID table to query. * @gid: The GID value found at the specified index. + * @attr: The GID attribute found at the specified index (only in RoCE). + * NULL means ignore (output parameter). * * ib_get_cached_gid() fetches the specified GID table entry stored in * the local software cache. */ int ib_get_cached_gid(struct ib_device *device, u8 port_num, int index, - union ib_gid *gid); + union ib_gid *gid, + struct ib_gid_attr *attr); /** * ib_find_cached_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @ndev: In RoCE, the net device of the device. NULL means ignore. * @port_num: The port number of the device where the GID value was found. * @index: The index into the cached GID table where the GID was found. This * parameter may be NULL. * * ib_find_cached_gid() searches for the specified GID value in * the local software cache. 
*/ int ib_find_cached_gid(struct ib_device *device, - union ib_gid *gid, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, u8 *port_num, u16 *index); +/** + * ib_find_cached_gid_by_port - Returns the GID table index where a specified + * GID value occurs + * @device: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @port_num: The port number of the device where the GID value should be + * searched. + * @ndev: In RoCE, the net device of the device. NULL means ignore. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_find_cached_gid_by_port() searches for the specified GID value in + * the local software cache. + */ +int ib_find_cached_gid_by_port(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port_num, + struct net_device *ndev, + u16 *index); + +int ib_find_gid_by_filter(struct ib_device *device, + const union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index); /** * ib_get_cached_pkey - Returns a cached PKey table entry * @device: The device to query. * @port_num: The port number of the device to query. * @index: The index into the cached PKey table to query. * @pkey: The PKey value found at the specified index. * * ib_get_cached_pkey() fetches the specified PKey table entry stored in * the local software cache. */ int ib_get_cached_pkey(struct ib_device *device_handle, u8 port_num, int index, u16 *pkey); /** * ib_find_cached_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the cached PKey table where the PKey was found.
* * ib_find_cached_pkey() searches the specified PKey table in * the local software cache. */ int ib_find_cached_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); /** * ib_find_exact_cached_pkey - Returns the PKey table index where a specified * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the cached PKey table where the PKey was found. * * ib_find_exact_cached_pkey() searches the specified PKey table in * the local software cache. */ int ib_find_exact_cached_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); /** * ib_get_cached_lmc - Returns a cached lmc table entry * @device: The device to query. * @port_num: The port number of the device to query. * @lmc: The lmc value for the specified port for that device. * * ib_get_cached_lmc() fetches the specified lmc table entry stored in * the local software cache. */ int ib_get_cached_lmc(struct ib_device *device, u8 port_num, u8 *lmc); #endif /* _IB_CACHE_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_cm.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_cm.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_cm.h (revision 319974) @@ -1,606 +1,606 @@ /* * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #if !defined(IB_CM_H) #define IB_CM_H #include #include /* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */ extern struct class cm_class; enum ib_cm_state { IB_CM_IDLE, IB_CM_LISTEN, IB_CM_REQ_SENT, IB_CM_REQ_RCVD, IB_CM_MRA_REQ_SENT, IB_CM_MRA_REQ_RCVD, IB_CM_REP_SENT, IB_CM_REP_RCVD, IB_CM_MRA_REP_SENT, IB_CM_MRA_REP_RCVD, IB_CM_ESTABLISHED, IB_CM_DREQ_SENT, IB_CM_DREQ_RCVD, IB_CM_TIMEWAIT, IB_CM_SIDR_REQ_SENT, IB_CM_SIDR_REQ_RCVD }; enum ib_cm_lap_state { IB_CM_LAP_UNINIT, IB_CM_LAP_IDLE, IB_CM_LAP_SENT, IB_CM_LAP_RCVD, IB_CM_MRA_LAP_SENT, IB_CM_MRA_LAP_RCVD, }; enum ib_cm_event_type { IB_CM_REQ_ERROR, IB_CM_REQ_RECEIVED, IB_CM_REP_ERROR, IB_CM_REP_RECEIVED, IB_CM_RTU_RECEIVED, IB_CM_USER_ESTABLISHED, IB_CM_DREQ_ERROR, IB_CM_DREQ_RECEIVED, IB_CM_DREP_RECEIVED, IB_CM_TIMEWAIT_EXIT, IB_CM_MRA_RECEIVED, IB_CM_REJ_RECEIVED, IB_CM_LAP_ERROR, IB_CM_LAP_RECEIVED, IB_CM_APR_RECEIVED, IB_CM_SIDR_REQ_ERROR, IB_CM_SIDR_REQ_RECEIVED, IB_CM_SIDR_REP_RECEIVED }; enum ib_cm_data_size { IB_CM_REQ_PRIVATE_DATA_SIZE = 92, IB_CM_MRA_PRIVATE_DATA_SIZE = 222, IB_CM_REJ_PRIVATE_DATA_SIZE = 148, IB_CM_REP_PRIVATE_DATA_SIZE = 196, IB_CM_RTU_PRIVATE_DATA_SIZE = 224, IB_CM_DREQ_PRIVATE_DATA_SIZE = 220, IB_CM_DREP_PRIVATE_DATA_SIZE = 224, IB_CM_REJ_ARI_LENGTH = 72, IB_CM_LAP_PRIVATE_DATA_SIZE = 168, IB_CM_APR_PRIVATE_DATA_SIZE = 148, IB_CM_APR_INFO_LENGTH = 72, IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216, IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136, IB_CM_SIDR_REP_INFO_LENGTH = 72, - IB_CM_COMPARE_SIZE = 64 }; struct ib_cm_id; struct ib_cm_req_event_param { struct ib_cm_id *listen_id; + + /* P_Key that was used by the GMP's BTH header */ + u16 bth_pkey; + u8 port; struct ib_sa_path_rec *primary_path; struct ib_sa_path_rec *alternate_path; __be64 remote_ca_guid; u32 remote_qkey; u32 remote_qpn; enum ib_qp_type qp_type; u32 starting_psn; u8 responder_resources; u8 initiator_depth; unsigned int local_cm_response_timeout:5; unsigned int flow_control:1; unsigned int 
remote_cm_response_timeout:5; unsigned int retry_count:3; unsigned int rnr_retry_count:3; unsigned int srq:1; }; struct ib_cm_rep_event_param { __be64 remote_ca_guid; u32 remote_qkey; u32 remote_qpn; u32 starting_psn; u8 responder_resources; u8 initiator_depth; unsigned int target_ack_delay:5; unsigned int failover_accepted:2; unsigned int flow_control:1; unsigned int rnr_retry_count:3; unsigned int srq:1; }; enum ib_cm_rej_reason { IB_CM_REJ_NO_QP = 1, IB_CM_REJ_NO_EEC = 2, IB_CM_REJ_NO_RESOURCES = 3, IB_CM_REJ_TIMEOUT = 4, IB_CM_REJ_UNSUPPORTED = 5, IB_CM_REJ_INVALID_COMM_ID = 6, IB_CM_REJ_INVALID_COMM_INSTANCE = 7, IB_CM_REJ_INVALID_SERVICE_ID = 8, IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9, IB_CM_REJ_STALE_CONN = 10, IB_CM_REJ_RDC_NOT_EXIST = 11, IB_CM_REJ_INVALID_GID = 12, IB_CM_REJ_INVALID_LID = 13, IB_CM_REJ_INVALID_SL = 14, IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15, IB_CM_REJ_INVALID_HOP_LIMIT = 16, IB_CM_REJ_INVALID_PACKET_RATE = 17, IB_CM_REJ_INVALID_ALT_GID = 18, IB_CM_REJ_INVALID_ALT_LID = 19, IB_CM_REJ_INVALID_ALT_SL = 20, IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21, IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22, IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23, IB_CM_REJ_PORT_CM_REDIRECT = 24, IB_CM_REJ_PORT_REDIRECT = 25, IB_CM_REJ_INVALID_MTU = 26, IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27, IB_CM_REJ_CONSUMER_DEFINED = 28, IB_CM_REJ_INVALID_RNR_RETRY = 29, IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30, IB_CM_REJ_INVALID_CLASS_VERSION = 31, IB_CM_REJ_INVALID_FLOW_LABEL = 32, IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33 }; struct ib_cm_rej_event_param { enum ib_cm_rej_reason reason; void *ari; u8 ari_length; }; struct ib_cm_mra_event_param { u8 service_timeout; }; struct ib_cm_lap_event_param { struct ib_sa_path_rec *alternate_path; }; enum ib_cm_apr_status { IB_CM_APR_SUCCESS, IB_CM_APR_INVALID_COMM_ID, IB_CM_APR_UNSUPPORTED, IB_CM_APR_REJECT, IB_CM_APR_REDIRECT, IB_CM_APR_IS_CURRENT, IB_CM_APR_INVALID_QPN_EECN, IB_CM_APR_INVALID_LID, IB_CM_APR_INVALID_GID, IB_CM_APR_INVALID_FLOW_LABEL, 
IB_CM_APR_INVALID_TCLASS, IB_CM_APR_INVALID_HOP_LIMIT, IB_CM_APR_INVALID_PACKET_RATE, IB_CM_APR_INVALID_SL }; struct ib_cm_apr_event_param { enum ib_cm_apr_status ap_status; void *apr_info; u8 info_len; }; struct ib_cm_sidr_req_event_param { struct ib_cm_id *listen_id; + __be64 service_id; + /* P_Key that was used by the GMP's BTH header */ + u16 bth_pkey; u8 port; u16 pkey; }; enum ib_cm_sidr_status { IB_SIDR_SUCCESS, IB_SIDR_UNSUPPORTED, IB_SIDR_REJECT, IB_SIDR_NO_QP, IB_SIDR_REDIRECT, IB_SIDR_UNSUPPORTED_VERSION }; struct ib_cm_sidr_rep_event_param { enum ib_cm_sidr_status status; u32 qkey; u32 qpn; void *info; u8 info_len; }; struct ib_cm_event { enum ib_cm_event_type event; union { struct ib_cm_req_event_param req_rcvd; struct ib_cm_rep_event_param rep_rcvd; /* No data for RTU received events. */ struct ib_cm_rej_event_param rej_rcvd; struct ib_cm_mra_event_param mra_rcvd; struct ib_cm_lap_event_param lap_rcvd; struct ib_cm_apr_event_param apr_rcvd; /* No data for DREQ/DREP received events. */ struct ib_cm_sidr_req_event_param sidr_req_rcvd; struct ib_cm_sidr_rep_event_param sidr_rep_rcvd; enum ib_wc_status send_status; } param; void *private_data; }; #define CM_REQ_ATTR_ID cpu_to_be16(0x0010) #define CM_MRA_ATTR_ID cpu_to_be16(0x0011) #define CM_REJ_ATTR_ID cpu_to_be16(0x0012) #define CM_REP_ATTR_ID cpu_to_be16(0x0013) #define CM_RTU_ATTR_ID cpu_to_be16(0x0014) #define CM_DREQ_ATTR_ID cpu_to_be16(0x0015) #define CM_DREP_ATTR_ID cpu_to_be16(0x0016) #define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017) #define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) #define CM_LAP_ATTR_ID cpu_to_be16(0x0019) #define CM_APR_ATTR_ID cpu_to_be16(0x001A) /** * ib_cm_handler - User-defined callback to process communication events. * @cm_id: Communication identifier associated with the reported event. * @event: Information about the communication event. 
* * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events * generated as a result of listen requests result in the allocation of a * new @cm_id. The new @cm_id is returned to the user through this callback. * Clients are responsible for destroying the new @cm_id. For peer-to-peer * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds * to a user's existing communication identifier. * * Users may not call ib_destroy_cm_id while in the context of this callback; * however, returning a non-zero value instructs the communication manager to * destroy the @cm_id after the callback completes. */ typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id, struct ib_cm_event *event); struct ib_cm_id { ib_cm_handler cm_handler; void *context; struct ib_device *device; __be64 service_id; __be64 service_mask; enum ib_cm_state state; /* internal CM/debug use */ enum ib_cm_lap_state lap_state; /* internal CM/debug use */ __be32 local_id; __be32 remote_id; u32 remote_cm_qpn; /* 1 unless redirected */ }; /** * ib_create_cm_id - Allocate a communication identifier. * @device: Device associated with the cm_id. All related communication will * be associated with the specified device. * @cm_handler: Callback invoked to notify the user of CM events. * @context: User specified context associated with the communication * identifier. * * Communication identifiers are used to track connection states, service * ID resolution requests, and listen requests. */ struct ib_cm_id *ib_create_cm_id(struct ib_device *device, ib_cm_handler cm_handler, void *context); /** * ib_destroy_cm_id - Destroy a connection identifier. * @cm_id: Connection identifier to destroy. * * This call blocks until the connection identifier is destroyed. 
*/ void ib_destroy_cm_id(struct ib_cm_id *cm_id); #define IB_SERVICE_ID_AGN_MASK cpu_to_be64(0xFF00000000000000ULL) #define IB_CM_ASSIGN_SERVICE_ID cpu_to_be64(0x0200000000000000ULL) #define IB_CMA_SERVICE_ID cpu_to_be64(0x0000000001000000ULL) #define IB_CMA_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFF000000ULL) #define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL) #define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL) -struct ib_cm_compare_data { - u8 data[IB_CM_COMPARE_SIZE]; - u8 mask[IB_CM_COMPARE_SIZE]; -}; - /** * ib_cm_listen - Initiates listening on the specified service ID for * connection and service ID resolution requests. * @cm_id: Connection identifier associated with the listen request. * @service_id: Service identifier matched against incoming connection * and service ID resolution requests. The service ID should be specified * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will * assign a service ID to the caller. * @service_mask: Mask applied to service ID used to listen across a * range of service IDs. If set to 0, the service ID is matched * exactly. This parameter is ignored if %service_id is set to * IB_CM_ASSIGN_SERVICE_ID. - * @compare_data: This parameter is optional. It specifies data that must - * appear in the private data of a connection request for the specified - * listen request. 
*/ -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, - struct ib_cm_compare_data *compare_data); +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, + __be64 service_mask); +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id); + struct ib_cm_req_param { struct ib_sa_path_rec *primary_path; struct ib_sa_path_rec *alternate_path; __be64 service_id; u32 qp_num; enum ib_qp_type qp_type; u32 starting_psn; const void *private_data; u8 private_data_len; u8 peer_to_peer; u8 responder_resources; u8 initiator_depth; u8 remote_cm_response_timeout; u8 flow_control; u8 local_cm_response_timeout; u8 retry_count; u8 rnr_retry_count; u8 max_cm_retries; u8 srq; }; /** * ib_send_cm_req - Sends a connection request to the remote node. * @cm_id: Connection identifier that will be associated with the * connection request. * @param: Connection request information needed to establish the * connection. */ int ib_send_cm_req(struct ib_cm_id *cm_id, struct ib_cm_req_param *param); struct ib_cm_rep_param { u32 qp_num; u32 starting_psn; const void *private_data; u8 private_data_len; u8 responder_resources; u8 initiator_depth; u8 failover_accepted; u8 flow_control; u8 rnr_retry_count; u8 srq; }; /** * ib_send_cm_rep - Sends a connection reply in response to a connection * request. * @cm_id: Connection identifier that will be associated with the * connection request. * @param: Connection reply information needed to establish the * connection. */ int ib_send_cm_rep(struct ib_cm_id *cm_id, struct ib_cm_rep_param *param); /** * ib_send_cm_rtu - Sends a connection ready to use message in response * to a connection reply message. * @cm_id: Connection identifier associated with the connection request. * @private_data: Optional user-defined private data sent with the * ready to use message. * @private_data_len: Size of the private data buffer, in bytes. 
*/ int ib_send_cm_rtu(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len); /** * ib_send_cm_dreq - Sends a disconnection request for an existing * connection. * @cm_id: Connection identifier associated with the connection being * released. * @private_data: Optional user-defined private data sent with the * disconnection request message. * @private_data_len: Size of the private data buffer, in bytes. */ int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len); /** * ib_send_cm_drep - Sends a disconnection reply to a disconnection request. * @cm_id: Connection identifier associated with the connection being * released. * @private_data: Optional user-defined private data sent with the * disconnection reply message. * @private_data_len: Size of the private data buffer, in bytes. * * If the cm_id is in the correct state, the CM will transition the connection * to the timewait state, even if an error occurs sending the DREP message. */ int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len); /** * ib_cm_notify - Notifies the CM of an event reported to the consumer. * @cm_id: Connection identifier to transition to established. * @event: Type of event. * * This routine should be invoked by users to notify the CM of relevant * communication events. Events that should be reported to the CM and * when to report them are: * * IB_EVENT_COMM_EST - Used when a message is received on a connected * QP before an RTU has been received. * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over * to the alternate path. */ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event); /** * ib_send_cm_rej - Sends a connection rejection message to the * remote node. * @cm_id: Connection identifier associated with the connection being * rejected. * @reason: Reason for the connection request rejection. * @ari: Optional additional rejection information. 
* @ari_length: Size of the additional rejection information, in bytes. * @private_data: Optional user-defined private data sent with the * rejection message. * @private_data_len: Size of the private data buffer, in bytes. */ int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len); #define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */ /** * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection * message. * @cm_id: Connection identifier associated with the connection message. * @service_timeout: The lower 5-bits specify the maximum time required for * the sender to reply to the connection message. The upper 3-bits * specify additional control flags. * @private_data: Optional user-defined private data sent with the * message receipt acknowledgement. * @private_data_len: Size of the private data buffer, in bytes. */ int ib_send_cm_mra(struct ib_cm_id *cm_id, u8 service_timeout, const void *private_data, u8 private_data_len); /** * ib_send_cm_lap - Sends a load alternate path request. * @cm_id: Connection identifier associated with the load alternate path * message. * @alternate_path: A path record that identifies the alternate path to * load. * @private_data: Optional user-defined private data sent with the * load alternate path message. * @private_data_len: Size of the private data buffer, in bytes. */ int ib_send_cm_lap(struct ib_cm_id *cm_id, struct ib_sa_path_rec *alternate_path, const void *private_data, u8 private_data_len); /** * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning * to a specified QP state. * @cm_id: Communication identifier associated with the QP attributes to * initialize. * @qp_attr: On input, specifies the desired QP state. On output, the * mandatory and desired optional attributes will be set in order to * modify the QP to the specified state. 
* @qp_attr_mask: The QP attribute mask that may be used to transition the * QP to the specified state. * * Users must set the @qp_attr->qp_state to the desired QP state. This call * will set all required attributes for the given transition, along with * known optional attributes. Users may override the attributes returned from * this call before calling ib_modify_qp. */ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask); /** * ib_send_cm_apr - Sends an alternate path response message in response to * a load alternate path request. * @cm_id: Connection identifier associated with the alternate path response. * @status: Reply status sent with the alternate path response. * @info: Optional additional information sent with the alternate path * response. * @info_length: Size of the additional information, in bytes. * @private_data: Optional user-defined private data sent with the * alternate path response message. * @private_data_len: Size of the private data buffer, in bytes. */ int ib_send_cm_apr(struct ib_cm_id *cm_id, enum ib_cm_apr_status status, void *info, u8 info_length, const void *private_data, u8 private_data_len); struct ib_cm_sidr_req_param { struct ib_sa_path_rec *path; __be64 service_id; int timeout_ms; const void *private_data; u8 private_data_len; u8 max_cm_retries; }; /** * ib_send_cm_sidr_req - Sends a service ID resolution request to the * remote node. * @cm_id: Communication identifier that will be associated with the * service ID resolution request. * @param: Service ID resolution request information. */ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, struct ib_cm_sidr_req_param *param); struct ib_cm_sidr_rep_param { u32 qp_num; u32 qkey; enum ib_cm_sidr_status status; const void *info; u8 info_length; const void *private_data; u8 private_data_len; }; /** * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the * remote node. 
* @cm_id: Communication identifier associated with the received service ID * resolution request. * @param: Service ID resolution reply information. */ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param); - -int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac); #endif /* IB_CM_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_hdrs.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_hdrs.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_hdrs.h (revision 319974) @@ -0,0 +1,178 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef IB_HDRS_H +#define IB_HDRS_H + +#include +#include +#include + +#define IB_SEQ_NAK (3 << 29) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +#define IB_BTH_REQ_ACK BIT(31) +#define IB_BTH_SOLICITED BIT(23) +#define IB_BTH_MIG_REQ BIT(22) + +#define IB_GRH_VERSION 6 +#define IB_GRH_VERSION_MASK 0xF +#define IB_GRH_VERSION_SHIFT 28 +#define IB_GRH_TCLASS_MASK 0xFF +#define IB_GRH_TCLASS_SHIFT 20 +#define IB_GRH_FLOW_MASK 0xFFFFF +#define IB_GRH_FLOW_SHIFT 0 +#define IB_GRH_NEXT_HDR 0x1B + +struct ib_reth { + __be64 vaddr; /* potentially unaligned */ + __be32 rkey; + __be32 length; +} __packed; + +struct ib_atomic_eth { + __be64 vaddr; /* potentially unaligned */ + __be32 rkey; + __be64 swap_data; /* potentially unaligned */ + __be64 compare_data; /* potentially unaligned */ +} __packed; + +union ib_ehdrs { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be64 atomic_ack_eth; /* potentially unaligned */ + } __packed at; + __be32 imm_data; + __be32 aeth; + __be32 ieth; + struct ib_atomic_eth atomic_eth; +} __packed; + +struct ib_other_headers { + __be32 bth[3]; + union ib_ehdrs u; +} __packed; + +struct ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + } u; +} __packed; + +/* accessors for unaligned __be64 items */ + +static inline u64 ib_u64_get(__be64 *p) +{ + return get_unaligned_be64(p); +} + +static inline void ib_u64_put(u64 val, __be64 *p) +{ + put_unaligned_be64(val, p); +} + +static inline u64 get_ib_reth_vaddr(struct ib_reth *reth) +{ + return ib_u64_get(&reth->vaddr); +} + +static inline void put_ib_reth_vaddr(u64 val, struct ib_reth *reth) +{ + 
ib_u64_put(val, &reth->vaddr); +} + +static inline u64 get_ib_ateth_vaddr(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->vaddr); +} + +static inline void put_ib_ateth_vaddr(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->vaddr); +} + +static inline u64 get_ib_ateth_swap(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->swap_data); +} + +static inline void put_ib_ateth_swap(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->swap_data); +} + +static inline u64 get_ib_ateth_compare(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->compare_data); +} + +static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->compare_data); +} + +#endif /* IB_HDRS_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_hdrs.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_mad.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_mad.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_mad.h (revision 319974) @@ -1,664 +1,849 @@ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #if !defined(IB_MAD_H) #define IB_MAD_H #include #include +#include -/* Management base version */ +/* Management base versions */ #define IB_MGMT_BASE_VERSION 1 +#define OPA_MGMT_BASE_VERSION 0x80 +#define OPA_SMP_CLASS_VERSION 0x80 + /* Management classes */ #define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01 #define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81 #define IB_MGMT_CLASS_SUBN_ADM 0x03 #define IB_MGMT_CLASS_PERF_MGMT 0x04 #define IB_MGMT_CLASS_BM 0x05 #define IB_MGMT_CLASS_DEVICE_MGMT 0x06 #define IB_MGMT_CLASS_CM 0x07 #define IB_MGMT_CLASS_SNMP 0x08 #define IB_MGMT_CLASS_DEVICE_ADM 0x10 #define IB_MGMT_CLASS_BOOT_MGMT 0x11 #define IB_MGMT_CLASS_BIS 0x12 #define IB_MGMT_CLASS_CONG_MGMT 0x21 #define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30 #define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F #define IB_OPENIB_OUI (0x001405) /* Management methods */ #define IB_MGMT_METHOD_GET 0x01 #define IB_MGMT_METHOD_SET 0x02 #define IB_MGMT_METHOD_GET_RESP 0x81 #define IB_MGMT_METHOD_SEND 0x03 #define IB_MGMT_METHOD_TRAP 0x05 #define IB_MGMT_METHOD_REPORT 0x06 #define IB_MGMT_METHOD_REPORT_RESP 0x86 #define IB_MGMT_METHOD_TRAP_REPRESS 0x07 #define IB_MGMT_METHOD_RESP 0x80 #define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) #define IB_MGMT_MAX_METHODS 128 /* MAD Status field bit masks */ #define IB_MGMT_MAD_STATUS_SUCCESS 0x0000 #define IB_MGMT_MAD_STATUS_BUSY 0x0001 #define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002 #define IB_MGMT_MAD_STATUS_BAD_VERSION 0x0004 #define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008 #define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c #define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c /* RMPP information */ #define IB_MGMT_RMPP_VERSION 1 #define IB_MGMT_RMPP_TYPE_DATA 1 #define IB_MGMT_RMPP_TYPE_ACK 2 #define IB_MGMT_RMPP_TYPE_STOP 3 #define IB_MGMT_RMPP_TYPE_ABORT 4 #define IB_MGMT_RMPP_FLAG_ACTIVE 1 #define IB_MGMT_RMPP_FLAG_FIRST (1<<1) #define IB_MGMT_RMPP_FLAG_LAST (1<<2) #define IB_MGMT_RMPP_NO_RESPTIME 0x1F #define IB_MGMT_RMPP_STATUS_SUCCESS 0 #define 
IB_MGMT_RMPP_STATUS_RESX 1 #define IB_MGMT_RMPP_STATUS_ABORT_MIN 118 #define IB_MGMT_RMPP_STATUS_T2L 118 #define IB_MGMT_RMPP_STATUS_BAD_LEN 119 #define IB_MGMT_RMPP_STATUS_BAD_SEG 120 #define IB_MGMT_RMPP_STATUS_BADT 121 #define IB_MGMT_RMPP_STATUS_W2S 122 #define IB_MGMT_RMPP_STATUS_S2B 123 #define IB_MGMT_RMPP_STATUS_BAD_STATUS 124 #define IB_MGMT_RMPP_STATUS_UNV 125 #define IB_MGMT_RMPP_STATUS_TMR 126 #define IB_MGMT_RMPP_STATUS_UNSPEC 127 #define IB_MGMT_RMPP_STATUS_ABORT_MAX 127 #define IB_QP0 0 #define IB_QP1 cpu_to_be32(1) #define IB_QP1_QKEY 0x80010000 #define IB_QP_SET_QKEY 0x80000000 #define IB_DEFAULT_PKEY_PARTIAL 0x7FFF #define IB_DEFAULT_PKEY_FULL 0xFFFF +/* + * Generic trap/notice types + */ +#define IB_NOTICE_TYPE_FATAL 0x80 +#define IB_NOTICE_TYPE_URGENT 0x81 +#define IB_NOTICE_TYPE_SECURITY 0x82 +#define IB_NOTICE_TYPE_SM 0x83 +#define IB_NOTICE_TYPE_INFO 0x84 + +/* + * Generic trap/notice producers + */ +#define IB_NOTICE_PROD_CA cpu_to_be16(1) +#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2) +#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3) +#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4) + enum { IB_MGMT_MAD_HDR = 24, IB_MGMT_MAD_DATA = 232, IB_MGMT_RMPP_HDR = 36, IB_MGMT_RMPP_DATA = 220, IB_MGMT_VENDOR_HDR = 40, IB_MGMT_VENDOR_DATA = 216, IB_MGMT_SA_HDR = 56, IB_MGMT_SA_DATA = 200, IB_MGMT_DEVICE_HDR = 64, IB_MGMT_DEVICE_DATA = 192, + IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA, + OPA_MGMT_MAD_DATA = 2024, + OPA_MGMT_RMPP_DATA = 2012, + OPA_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + OPA_MGMT_MAD_DATA, }; struct ib_mad_hdr { u8 base_version; u8 mgmt_class; u8 class_version; u8 method; __be16 status; __be16 class_specific; __be64 tid; __be16 attr_id; __be16 resv; __be32 attr_mod; }; struct ib_rmpp_hdr { u8 rmpp_version; u8 rmpp_type; u8 rmpp_rtime_flags; u8 rmpp_status; __be32 seg_num; __be32 paylen_newwin; }; typedef u64 __bitwise ib_sa_comp_mask; #define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n))) /* * ib_sa_hdr and 
ib_sa_mad structures must be packed because they have * 64-bit fields that are only 32-bit aligned. 64-bit architectures will * lay them out wrong otherwise. (And unfortunately they are sent on * the wire so we can't change the layout) */ struct ib_sa_hdr { __be64 sm_key; __be16 attr_offset; __be16 reserved; ib_sa_comp_mask comp_mask; } __attribute__ ((packed)); struct ib_mad { struct ib_mad_hdr mad_hdr; u8 data[IB_MGMT_MAD_DATA]; }; +struct opa_mad { + struct ib_mad_hdr mad_hdr; + u8 data[OPA_MGMT_MAD_DATA]; +}; + struct ib_rmpp_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; u8 data[IB_MGMT_RMPP_DATA]; }; +struct opa_rmpp_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 data[OPA_MGMT_RMPP_DATA]; +}; + struct ib_sa_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; struct ib_sa_hdr sa_hdr; u8 data[IB_MGMT_SA_DATA]; } __attribute__ ((packed)); struct ib_vendor_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; u8 reserved; u8 oui[3]; u8 data[IB_MGMT_VENDOR_DATA]; }; +#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001) + +#define IB_CLASS_PORT_INFO_RESP_TIME_MASK 0x1F +#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5 + struct ib_class_port_info { u8 base_version; u8 class_version; __be16 capability_mask; - u8 reserved[3]; - u8 resp_time_value; + /* 27 bits for cap_mask2, 5 bits for resp_time */ + __be32 cap_mask2_resp_time; u8 redirect_gid[16]; __be32 redirect_tcslfl; __be16 redirect_lid; __be16 redirect_pkey; __be32 redirect_qp; __be32 redirect_qkey; u8 trap_gid[16]; __be32 trap_tcslfl; __be16 trap_lid; __be16 trap_pkey; __be32 trap_hlqp; __be32 trap_qkey; }; /** + * ib_get_cpi_resp_time - Returns the resp_time value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. 
+ */ +static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi) +{ + return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) & + IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_set_cpi_resp_time - Sets the response time in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @rtime: The response time to set. + */ +static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi, + u8 rtime) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_get_cpi_capmask2 - Returns the capmask2 value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. + */ +static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi) +{ + return (be32_to_cpu(cpi->cap_mask2_resp_time) >> + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +/** + * ib_set_cpi_capmask2 - Sets the capmask2 in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @capmask2: The capmask2 to set. 
+ */ +static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi, + u32 capmask2) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(capmask2 << + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +struct ib_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 issuer_lid; + __be16 toggle_count; + + union { + struct { + u8 details[54]; + } raw_data; + + struct { + __be16 reserved; + __be16 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_131; + + struct { + __be16 reserved; + __be16 lid; /* LID where change occurred */ + u8 reserved2; + u8 local_changes; /* low bit - local changes */ + __be32 new_cap_mask; /* new capability mask */ + u8 reserved3; + u8 change_flags; /* low 3 bits only */ + } __packed ntc_144; + + struct { + __be16 reserved; + __be16 lid; /* lid where sys guid changed */ + __be16 reserved2; + __be64 new_sys_guid; + } __packed ntc_145; + + struct { + __be16 reserved; + __be16 lid; + __be16 dr_slid; + u8 method; + u8 reserved2; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 reserved3; + u8 dr_trunc_hop; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be16 reserved; + __be16 lid1; + __be16 lid2; + __be32 key; + __be32 sl_qp1; /* SL: high 4 bits */ + __be32 qp2; /* high 8 bits reserved */ + union ib_gid gid1; + union ib_gid gid2; + } __packed ntc_257_258; + + } details; +}; + +/** * ib_mad_send_buf - MAD data buffer and work request for sends. * @next: A pointer used to chain together MADs for posting. * @mad: References an allocated MAD data buffer for MADs that do not have * RMPP active. For MADs using RMPP, references the common and management * class specific headers. * @mad_agent: MAD agent that allocated the buffer. * @ah: The address handle to use when sending the MAD. * @context: User-controlled context fields. 
* @hdr_len: Indicates the size of the data header of the MAD. This length * includes the common MAD, RMPP, and class specific headers. * @data_len: Indicates the total size of user-transferred data. * @seg_count: The number of RMPP segments allocated for this send. - * @seg_size: Size of each RMPP segment. + * @seg_size: Size of the data in each RMPP segment. This does not include + * class specific headers. + * @seg_rmpp_size: Size of each RMPP segment including the class specific + * headers. * @timeout_ms: Time to wait for a response. * @retries: Number of times to retry a request for a response. For MADs * using RMPP, this applies per window. On completion, returns the number * of retries needed to complete the transfer. * * Users are responsible for initializing the MAD buffer itself, with the * exception of any RMPP header. Additional segment buffer space allocated * beyond data_len is padding. */ struct ib_mad_send_buf { struct ib_mad_send_buf *next; void *mad; struct ib_mad_agent *mad_agent; struct ib_ah *ah; void *context[2]; int hdr_len; int data_len; int seg_count; int seg_size; + int seg_rmpp_size; int timeout_ms; int retries; }; /** * ib_response_mad - Returns if the specified MAD has been generated in * response to a sent request or trap. */ -int ib_response_mad(struct ib_mad *mad); +int ib_response_mad(const struct ib_mad_hdr *hdr); /** * ib_get_rmpp_resptime - Returns the RMPP response time. * @rmpp_hdr: An RMPP header. */ static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr) { return rmpp_hdr->rmpp_rtime_flags >> 3; } /** * ib_get_rmpp_flags - Returns the RMPP flags. * @rmpp_hdr: An RMPP header. */ -static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr) +static inline u8 ib_get_rmpp_flags(const struct ib_rmpp_hdr *rmpp_hdr) { return rmpp_hdr->rmpp_rtime_flags & 0x7; } /** * ib_set_rmpp_resptime - Sets the response time in an RMPP header. * @rmpp_hdr: An RMPP header. * @rtime: The response time to set. 
*/ static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime) { rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3); } /** * ib_set_rmpp_flags - Sets the flags in an RMPP header. * @rmpp_hdr: An RMPP header. * @flags: The flags to set. */ static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags) { rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) | (flags & 0x7); } struct ib_mad_agent; struct ib_mad_send_wc; struct ib_mad_recv_wc; /** * ib_mad_send_handler - callback handler for a sent MAD. * @mad_agent: MAD agent that sent the MAD. * @mad_send_wc: Send work completion information on the sent MAD. */ typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc); /** * ib_mad_snoop_handler - Callback handler for snooping sent MADs. * @mad_agent: MAD agent that snooped the MAD. - * @send_wr: Work request information on the sent MAD. + * @send_buf: send MAD data buffer. * @mad_send_wc: Work completion information on the sent MAD. Valid * only for snooping that occurs on a send completion. * - * Clients snooping MADs should not modify data referenced by the @send_wr + * Clients snooping MADs should not modify data referenced by the @send_buf * or @mad_send_wc. */ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_send_wc *mad_send_wc); /** * ib_mad_recv_handler - callback handler for a received MAD. * @mad_agent: MAD agent requesting the received MAD. + * @send_buf: Send buffer if found, else NULL * @mad_recv_wc: Received work completion information on the received MAD. * * MADs received in response to a send request operation will be handed to * the user before the send operation completes. All data buffers given * to registered agents through this routine are owned by the receiving * client, except for snooping agents. 
Clients snooping MADs should not * modify the data referenced by @mad_recv_wc. */ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc); /** * ib_mad_agent - Used to track MAD registration with the access layer. * @device: Reference to device registration is on. * @qp: Reference to QP used for sending and receiving MADs. * @mr: Memory region for system memory usable for DMA. * @recv_handler: Callback handler for a received MAD. * @send_handler: Callback handler for a sent MAD. * @snoop_handler: Callback handler for snooped sent MADs. * @context: User-specified context associated with this registration. * @hi_tid: Access layer assigned transaction ID for this client. * Unsolicited MADs sent by this client will have the upper 32-bits * of their TID set to this value. + * @flags: registration flags * @port_num: Port number on which QP is registered * @rmpp_version: If set, indicates the RMPP version used by this agent. */ +enum { + IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP, +}; struct ib_mad_agent { struct ib_device *device; struct ib_qp *qp; - struct ib_mr *mr; ib_mad_recv_handler recv_handler; ib_mad_send_handler send_handler; ib_mad_snoop_handler snoop_handler; void *context; u32 hi_tid; + u32 flags; u8 port_num; u8 rmpp_version; }; /** * ib_mad_send_wc - MAD send completion information. * @send_buf: Send MAD data buffer associated with the send MAD request. * @status: Completion status. * @vendor_err: Optional vendor error information returned with a failed * request. */ struct ib_mad_send_wc { struct ib_mad_send_buf *send_buf; enum ib_wc_status status; u32 vendor_err; }; /** * ib_mad_recv_buf - received MAD buffer information. * @list: Reference to next data buffer for a received RMPP MAD. * @grh: References a data buffer containing the global route header. * The data refereced by this buffer is only valid if the GRH is * valid. * @mad: References the start of the received MAD. 
*/ struct ib_mad_recv_buf { struct list_head list; struct ib_grh *grh; - struct ib_mad *mad; + union { + struct ib_mad *mad; + struct opa_mad *opa_mad; + }; }; /** * ib_mad_recv_wc - received MAD information. * @wc: Completion information for the received data. * @recv_buf: Specifies the location of the received data buffer(s). * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers. * @mad_len: The length of the received MAD, without duplicated headers. + * @mad_seg_size: The size of individual MAD segments * * For received response, the wr_id contains a pointer to the ib_mad_send_buf * for the corresponding send request. */ struct ib_mad_recv_wc { struct ib_wc *wc; struct ib_mad_recv_buf recv_buf; struct list_head rmpp_list; int mad_len; + size_t mad_seg_size; }; /** * ib_mad_reg_req - MAD registration request * @mgmt_class: Indicates which management class of MADs should be receive * by the caller. This field is only required if the user wishes to * receive unsolicited MADs, otherwise it should be 0. * @mgmt_class_version: Indicates which version of MADs for the given * management class to receive. * @oui: Indicates IEEE OUI when mgmt_class is a vendor class * in the range from 0x30 to 0x4f. Otherwise not used. * @method_mask: The caller will receive unsolicited MADs for any method * where @method_mask = 1. + * */ struct ib_mad_reg_req { u8 mgmt_class; u8 mgmt_class_version; u8 oui[3]; DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS); }; /** * ib_register_mad_agent - Register to send/receive MADs. * @device: The device to register with. * @port_num: The port on the specified device to use. * @qp_type: Specifies which QP to access. Must be either * IB_QPT_SMI or IB_QPT_GSI. * @mad_reg_req: Specifies which unsolicited MADs should be received * by the caller. This parameter may be NULL if the caller only * wishes to receive solicited responses. 
* @rmpp_version: If set, indicates that the client will send * and receive MADs that contain the RMPP header for the given version. * If set to 0, indicates that RMPP is not used by this client. * @send_handler: The completion callback routine invoked after a send * request has completed. * @recv_handler: The completion callback routine invoked for a received * MAD. * @context: User specified context associated with the registration. + * @registration_flags: Registration flags to set for this agent */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, struct ib_mad_reg_req *mad_reg_req, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, - void *context); + void *context, + u32 registration_flags); enum ib_mad_snoop_flags { /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/ IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2), /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/ IB_MAD_SNOOP_RECVS = (1<<4) /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/ /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/ }; /** * ib_register_mad_snoop - Register to snoop sent and received MADs. * @device: The device to register with. * @port_num: The port on the specified device to use. * @qp_type: Specifies which QP traffic to snoop. Must be either * IB_QPT_SMI or IB_QPT_GSI. * @mad_snoop_flags: Specifies information where snooping occurs. * @send_handler: The callback routine invoked for a snooped send. * @recv_handler: The callback routine invoked for a snooped receive. * @context: User specified context associated with the registration. */ struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, int mad_snoop_flags, ib_mad_snoop_handler snoop_handler, ib_mad_recv_handler recv_handler, void *context); /** * ib_unregister_mad_agent - Unregisters a client from using MAD services. * @mad_agent: Corresponding MAD registration request to deregister. 
* * After invoking this routine, MAD services are no longer usable by the * client on the associated QP. */ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); /** * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client. * @send_buf: Specifies the information needed to send the MAD(s). * @bad_send_buf: Specifies the MAD on which an error was encountered. This * parameter is optional if only a single MAD is posted. * * Sent MADs are not guaranteed to complete in the order that they were posted. * * If the MAD requires RMPP, the data buffer should contain a single copy * of the common MAD, RMPP, and class specific headers, followed by the class * defined data. If the class defined data would not divide evenly into * RMPP segments, then space must be allocated at the end of the referenced * buffer for any required padding. To indicate the amount of class defined * data being transferred, the paylen_newwin field in the RMPP header should * be set to the size of the class specific header plus the amount of class * defined data being transferred. The paylen_newwin field should be * specified in network-byte order. */ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, struct ib_mad_send_buf **bad_send_buf); /** * ib_free_recv_mad - Returns data buffers used to receive a MAD. * @mad_recv_wc: Work completion information for a received MAD. * * Clients receiving MADs through their ib_mad_recv_handler must call this * routine to return the work completion buffers to the access layer. */ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc); /** * ib_cancel_mad - Cancels an outstanding send MAD operation. * @mad_agent: Specifies the registration associated with sent MAD. * @send_buf: Indicates the MAD to cancel. * * MADs will be returned to the user through the corresponding * ib_mad_send_handler. 
*/ void ib_cancel_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf); /** * ib_modify_mad - Modifies an outstanding send MAD operation. * @mad_agent: Specifies the registration associated with sent MAD. * @send_buf: Indicates the MAD to modify. * @timeout_ms: New timeout value for sent MAD. * * This call will reset the timeout value for a sent MAD to the specified * value. */ int ib_modify_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, u32 timeout_ms); /** * ib_redirect_mad_qp - Registers a QP for MAD services. * @qp: Reference to a QP that requires MAD services. * @rmpp_version: If set, indicates that the client will send * and receive MADs that contain the RMPP header for the given version. * If set to 0, indicates that RMPP is not used by this client. * @send_handler: The completion callback routine invoked after a send * request has completed. * @recv_handler: The completion callback routine invoked for a received * MAD. * @context: User specified context associated with the registration. * * Use of this call allows clients to use MAD services, such as RMPP, * on user-owned QPs. After calling this routine, users may send * MADs on the specified QP by calling ib_mad_post_send. */ struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, void *context); /** * ib_process_mad_wc - Processes a work completion associated with a * MAD sent or received on a redirected QP. * @mad_agent: Specifies the registered MAD service using the redirected QP. * @wc: References a work completion associated with a sent or received * MAD segment. * * This routine is used to complete or continue processing on a MAD request. * If the work completion is associated with a send operation, calling * this routine is required to continue an RMPP transfer or to wait for a * corresponding response, if it is a request. 
If the work completion is * associated with a receive operation, calling this routine is required to * process an inbound or outbound RMPP transfer, or to match a response MAD * with its corresponding request. */ int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc); /** * ib_create_send_mad - Allocate and initialize a data buffer and work request * for sending a MAD. * @mad_agent: Specifies the registered MAD service to associate with the MAD. * @remote_qpn: Specifies the QPN of the receiving node. * @pkey_index: Specifies which PKey the MAD will be sent using. This field * is valid only if the remote_qpn is QP 1. * @rmpp_active: Indicates if the send will enable RMPP. * @hdr_len: Indicates the size of the data header of the MAD. This length * should include the common MAD header, RMPP header, plus any class * specific header. * @data_len: Indicates the size of any user-transferred data. The call will * automatically adjust the allocated buffer size to account for any * additional padding that may be necessary. * @gfp_mask: GFP mask used for the memory allocation. + * @base_version: Base Version of this MAD * * This routine allocates a MAD for sending. The returned MAD send buffer * will reference a data buffer usable for sending a MAD, along * with an initialized work request structure. Users may modify the returned * MAD data buffer before posting the send. * * The returned MAD header, class specific headers, and any padding will be * cleared. Users are responsible for initializing the common MAD header, * any class specific header, and MAD data area. * If @rmpp_active is set, the RMPP header will be initialized for sending. */ struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, int hdr_len, int data_len, - gfp_t gfp_mask); + gfp_t gfp_mask, + u8 base_version); /** * ib_is_mad_class_rmpp - returns whether given management class * supports RMPP. 
* @mgmt_class: management class * * This routine returns whether the management class supports RMPP. */ int ib_is_mad_class_rmpp(u8 mgmt_class); /** * ib_get_mad_data_offset - returns the data offset for a given * management class. * @mgmt_class: management class * * This routine returns the data offset in the MAD for the management * class requested. */ int ib_get_mad_data_offset(u8 mgmt_class); /** * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment. * @send_buf: Previously allocated send data buffer. * @seg_num: number of segment to return * * This routine returns a pointer to the data buffer of an RMPP MAD. * Users must provide synchronization to @send_buf around this call. */ void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num); /** * ib_free_send_mad - Returns data buffers used to send a MAD. * @send_buf: Previously allocated send data buffer. */ void ib_free_send_mad(struct ib_mad_send_buf *send_buf); + +/** + * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP. + * @agent: the agent in question + * @return: true if agent is performing rmpp, false otherwise. + */ +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent); #endif /* IB_MAD_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_pack.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_pack.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_pack.h (revision 319974) @@ -1,267 +1,305 @@ /* * Copyright (c) 2004 Topspin Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef IB_PACK_H #define IB_PACK_H #include enum { IB_LRH_BYTES = 8, IB_ETH_BYTES = 14, IB_VLAN_BYTES = 4, IB_GRH_BYTES = 40, + IB_IP4_BYTES = 20, + IB_UDP_BYTES = 8, IB_BTH_BYTES = 12, IB_DETH_BYTES = 8 }; struct ib_field { size_t struct_offset_bytes; size_t struct_size_bytes; int offset_words; int offset_bits; int size_bits; char *field_name; }; #define RESERVED \ .field_name = "reserved" /* * This macro cleans up the definitions of constants for BTH opcodes. * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY, * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives * the correct value. * * In short, user code should use the constants defined using the * macro rather than worrying about adding together other constants. 
*/ #define IB_OPCODE(transport, op) \ IB_OPCODE_ ## transport ## _ ## op = \ IB_OPCODE_ ## transport + IB_OPCODE_ ## op enum { /* transport types -- just used to define real constants */ IB_OPCODE_RC = 0x00, IB_OPCODE_UC = 0x20, IB_OPCODE_RD = 0x40, IB_OPCODE_UD = 0x60, + /* per IBTA 1.3 vol 1 Table 38, A10.3.2 */ + IB_OPCODE_CNP = 0x80, /* operations -- just used to define real constants */ IB_OPCODE_SEND_FIRST = 0x00, IB_OPCODE_SEND_MIDDLE = 0x01, IB_OPCODE_SEND_LAST = 0x02, IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, IB_OPCODE_SEND_ONLY = 0x04, IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, IB_OPCODE_RDMA_WRITE_FIRST = 0x06, IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07, IB_OPCODE_RDMA_WRITE_LAST = 0x08, IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, IB_OPCODE_RDMA_WRITE_ONLY = 0x0a, IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, IB_OPCODE_RDMA_READ_REQUEST = 0x0c, IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, IB_OPCODE_ACKNOWLEDGE = 0x11, IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, IB_OPCODE_COMPARE_SWAP = 0x13, IB_OPCODE_FETCH_ADD = 0x14, + /* opcode 0x15 is reserved */ + IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, + IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, /* real constants follow -- see comment about above IB_OPCODE() macro for more details */ /* RC */ IB_OPCODE(RC, SEND_FIRST), IB_OPCODE(RC, SEND_MIDDLE), IB_OPCODE(RC, SEND_LAST), IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), IB_OPCODE(RC, SEND_ONLY), IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), IB_OPCODE(RC, RDMA_WRITE_FIRST), IB_OPCODE(RC, RDMA_WRITE_MIDDLE), IB_OPCODE(RC, RDMA_WRITE_LAST), IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), IB_OPCODE(RC, RDMA_WRITE_ONLY), IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), IB_OPCODE(RC, RDMA_READ_REQUEST), IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST), IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), 
IB_OPCODE(RC, ACKNOWLEDGE), IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), IB_OPCODE(RC, COMPARE_SWAP), IB_OPCODE(RC, FETCH_ADD), + IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), + IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), /* UC */ IB_OPCODE(UC, SEND_FIRST), IB_OPCODE(UC, SEND_MIDDLE), IB_OPCODE(UC, SEND_LAST), IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), IB_OPCODE(UC, SEND_ONLY), IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), IB_OPCODE(UC, RDMA_WRITE_FIRST), IB_OPCODE(UC, RDMA_WRITE_MIDDLE), IB_OPCODE(UC, RDMA_WRITE_LAST), IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), IB_OPCODE(UC, RDMA_WRITE_ONLY), IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), /* RD */ IB_OPCODE(RD, SEND_FIRST), IB_OPCODE(RD, SEND_MIDDLE), IB_OPCODE(RD, SEND_LAST), IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), IB_OPCODE(RD, SEND_ONLY), IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), IB_OPCODE(RD, RDMA_WRITE_FIRST), IB_OPCODE(RD, RDMA_WRITE_MIDDLE), IB_OPCODE(RD, RDMA_WRITE_LAST), IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), IB_OPCODE(RD, RDMA_WRITE_ONLY), IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), IB_OPCODE(RD, RDMA_READ_REQUEST), IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST), IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), IB_OPCODE(RD, ACKNOWLEDGE), IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), IB_OPCODE(RD, COMPARE_SWAP), IB_OPCODE(RD, FETCH_ADD), /* UD */ IB_OPCODE(UD, SEND_ONLY), IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) }; enum { IB_LNH_RAW = 0, IB_LNH_IP = 1, IB_LNH_IBA_LOCAL = 2, IB_LNH_IBA_GLOBAL = 3 }; struct ib_unpacked_lrh { u8 virtual_lane; u8 link_version; u8 service_level; u8 link_next_header; __be16 destination_lid; __be16 packet_length; __be16 source_lid; }; struct ib_unpacked_grh { u8 ip_version; u8 traffic_class; __be32 flow_label; __be16 payload_length; u8 next_header; u8 hop_limit; union ib_gid source_gid; union ib_gid destination_gid; }; struct ib_unpacked_bth { u8 opcode; u8 solicited_event; u8 mig_req; u8 pad_count; u8 transport_header_version; 
__be16 pkey; __be32 destination_qpn; u8 ack_req; __be32 psn; }; struct ib_unpacked_deth { __be32 qkey; __be32 source_qpn; }; struct ib_unpacked_eth { u8 dmac_h[4]; u8 dmac_l[2]; u8 smac_h[2]; u8 smac_l[4]; __be16 type; }; +struct ib_unpacked_ip4 { + u8 ver; + u8 hdr_len; + u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + u8 ttl; + u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; +}; + +struct ib_unpacked_udp { + __be16 sport; + __be16 dport; + __be16 length; + __be16 csum; +}; + struct ib_unpacked_vlan { __be16 tag; __be16 type; }; struct ib_ud_header { int lrh_present; struct ib_unpacked_lrh lrh; - int eth_present; - struct ib_unpacked_eth eth; + int eth_present; + struct ib_unpacked_eth eth; int vlan_present; struct ib_unpacked_vlan vlan; - int grh_present; - struct ib_unpacked_grh grh; - struct ib_unpacked_bth bth; + int grh_present; + struct ib_unpacked_grh grh; + int ipv4_present; + struct ib_unpacked_ip4 ip4; + int udp_present; + struct ib_unpacked_udp udp; + struct ib_unpacked_bth bth; struct ib_unpacked_deth deth; - int immediate_present; - __be32 immediate_data; + int immediate_present; + __be32 immediate_data; }; void ib_pack(const struct ib_field *desc, int desc_len, void *structure, void *buf); void ib_unpack(const struct ib_field *desc, int desc_len, void *buf, void *structure); -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header); +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header); + +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header); int ib_ud_header_pack(struct ib_ud_header *header, void *buf); int ib_ud_header_unpack(void *buf, struct ib_ud_header *header); #endif /* IB_PACK_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_pma.h 
=================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_pma.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_pma.h (revision 319974) @@ -1,175 +1,157 @@ /* * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. * All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #if !defined(IB_PMA_H) #define IB_PMA_H #include -#define MAX_U32 0xffffffffULL -#define MAX_U16 0xffffUL - -/* Counters should be saturate once they reach their maximum value */ -#define ASSIGN_32BIT_COUNTER(counter, value) do { \ - if ((value) > MAX_U32) \ - counter = cpu_to_be32(MAX_U32); \ - else \ - counter = cpu_to_be32(value); \ -} while (0) - -/* Counters should be saturate once they reach their maximum value */ -#define ASSIGN_16BIT_COUNTER(counter, value) do { \ - if ((value) > MAX_U16) \ - counter = cpu_to_be16(MAX_U16); \ - else \ - counter = cpu_to_be16(value); \ -} while (0) - /* * PMA class portinfo capability mask bits */ #define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) #define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10) #define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) #define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) #define IB_PMA_PORT_SAMPLES_CONTROL cpu_to_be16(0x0010) #define IB_PMA_PORT_SAMPLES_RESULT cpu_to_be16(0x0011) #define IB_PMA_PORT_COUNTERS cpu_to_be16(0x0012) #define IB_PMA_PORT_COUNTERS_EXT cpu_to_be16(0x001D) #define IB_PMA_PORT_SAMPLES_RESULT_EXT cpu_to_be16(0x001E) struct ib_pma_mad { struct ib_mad_hdr mad_hdr; u8 reserved[40]; u8 data[192]; } __packed; struct ib_pma_portsamplescontrol { u8 opcode; u8 port_select; u8 tick; u8 counter_width; /* resv: 7:3, counter width: 2:0 */ __be32 counter_mask0_9; /* 2, 10 3-bit fields */ __be16 counter_mask10_14; /* 1, 5 3-bit fields */ u8 sample_mechanisms; u8 sample_status; /* only lower 2 bits */ __be64 option_mask; __be64 vendor_mask; __be32 sample_start; __be32 sample_interval; __be16 tag; __be16 counter_select[15]; __be32 reserved1; __be64 samples_only_option_mask; __be32 reserved2[28]; }; struct ib_pma_portsamplesresult { __be16 tag; __be16 sample_status; /* only lower 2 bits */ __be32 counter[15]; }; struct ib_pma_portsamplesresult_ext { __be16 tag; __be16 sample_status; /* only lower 2 bits */ 
__be32 extended_width; /* only upper 2 bits */ __be64 counter[15]; }; struct ib_pma_portcounters { u8 reserved; u8 port_select; __be16 counter_select; __be16 symbol_error_counter; u8 link_error_recovery_counter; u8 link_downed_counter; __be16 port_rcv_errors; __be16 port_rcv_remphys_errors; __be16 port_rcv_switch_relay_errors; __be16 port_xmit_discards; u8 port_xmit_constraint_errors; u8 port_rcv_constraint_errors; u8 reserved1; u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ __be16 reserved2; __be16 vl15_dropped; __be32 port_xmit_data; __be32 port_rcv_data; __be32 port_xmit_packets; __be32 port_rcv_packets; __be32 port_xmit_wait; } __packed; #define IB_PMA_SEL_SYMBOL_ERROR cpu_to_be16(0x0001) #define IB_PMA_SEL_LINK_ERROR_RECOVERY cpu_to_be16(0x0002) #define IB_PMA_SEL_LINK_DOWNED cpu_to_be16(0x0004) #define IB_PMA_SEL_PORT_RCV_ERRORS cpu_to_be16(0x0008) #define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS cpu_to_be16(0x0010) #define IB_PMA_SEL_PORT_XMIT_DISCARDS cpu_to_be16(0x0040) #define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS cpu_to_be16(0x0200) #define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS cpu_to_be16(0x0400) #define IB_PMA_SEL_PORT_VL15_DROPPED cpu_to_be16(0x0800) #define IB_PMA_SEL_PORT_XMIT_DATA cpu_to_be16(0x1000) #define IB_PMA_SEL_PORT_RCV_DATA cpu_to_be16(0x2000) #define IB_PMA_SEL_PORT_XMIT_PACKETS cpu_to_be16(0x4000) #define IB_PMA_SEL_PORT_RCV_PACKETS cpu_to_be16(0x8000) struct ib_pma_portcounters_ext { u8 reserved; u8 port_select; __be16 counter_select; __be32 reserved1; __be64 port_xmit_data; __be64 port_rcv_data; __be64 port_xmit_packets; __be64 port_rcv_packets; __be64 port_unicast_xmit_packets; __be64 port_unicast_rcv_packets; __be64 port_multicast_xmit_packets; __be64 port_multicast_rcv_packets; } __packed; #define IB_PMA_SELX_PORT_XMIT_DATA cpu_to_be16(0x0001) #define IB_PMA_SELX_PORT_RCV_DATA cpu_to_be16(0x0002) #define IB_PMA_SELX_PORT_XMIT_PACKETS cpu_to_be16(0x0004) #define IB_PMA_SELX_PORT_RCV_PACKETS cpu_to_be16(0x0008) #define 
IB_PMA_SELX_PORT_UNI_XMIT_PACKETS cpu_to_be16(0x0010) #define IB_PMA_SELX_PORT_UNI_RCV_PACKETS cpu_to_be16(0x0020) #define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS cpu_to_be16(0x0040) #define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS cpu_to_be16(0x0080) #endif /* IB_PMA_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_sa.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_sa.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_sa.h (revision 319974) @@ -1,424 +1,465 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef IB_SA_H #define IB_SA_H #include #include +#include -#include - #include #include enum { IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ IB_SA_METHOD_GET_TABLE = 0x12, IB_SA_METHOD_GET_TABLE_RESP = 0x92, IB_SA_METHOD_DELETE = 0x15, IB_SA_METHOD_DELETE_RESP = 0x95, IB_SA_METHOD_GET_MULTI = 0x14, IB_SA_METHOD_GET_MULTI_RESP = 0x94, IB_SA_METHOD_GET_TRACE_TBL = 0x13 }; enum { IB_SA_ATTR_CLASS_PORTINFO = 0x01, IB_SA_ATTR_NOTICE = 0x02, IB_SA_ATTR_INFORM_INFO = 0x03, IB_SA_ATTR_NODE_REC = 0x11, IB_SA_ATTR_PORT_INFO_REC = 0x12, IB_SA_ATTR_SL2VL_REC = 0x13, IB_SA_ATTR_SWITCH_REC = 0x14, IB_SA_ATTR_LINEAR_FDB_REC = 0x15, IB_SA_ATTR_RANDOM_FDB_REC = 0x16, IB_SA_ATTR_MCAST_FDB_REC = 0x17, IB_SA_ATTR_SM_INFO_REC = 0x18, IB_SA_ATTR_LINK_REC = 0x20, IB_SA_ATTR_GUID_INFO_REC = 0x30, IB_SA_ATTR_SERVICE_REC = 0x31, IB_SA_ATTR_PARTITION_REC = 0x33, IB_SA_ATTR_PATH_REC = 0x35, IB_SA_ATTR_VL_ARB_REC = 0x36, IB_SA_ATTR_MC_MEMBER_REC = 0x38, IB_SA_ATTR_TRACE_REC = 0x39, IB_SA_ATTR_MULTI_PATH_REC = 0x3a, IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b, IB_SA_ATTR_INFORM_INFO_REC = 0xf3 }; enum ib_sa_selector { IB_SA_GT = 0, IB_SA_LT = 1, IB_SA_EQ = 2, /* * The meaning of "best" depends on the attribute: for * example, for MTU best will return the largest available * MTU, while for packet life time, best will return the * smallest available life time. */ IB_SA_BEST = 3 }; /* + * There are 4 types of join states: + * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. + * The order corresponds to JoinState bits in MCMemberRecord. 
+ */ +enum ib_sa_mc_join_states { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + +#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) + +/* * Structures for SA records are named "struct ib_sa_xxx_rec." No * attempt is made to pack structures to match the physical layout of * SA records in SA MADs; all packing and unpacking is handled by the * SA query code. * * For a record with structure ib_sa_xxx_rec, the naming convention * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we * never use different abbreviations or otherwise change the spelling * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY). * * Reserved rows are indicated with comments to help maintainability. */ #define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\ IB_SA_COMP_MASK( 1)) #define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2) #define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3) #define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4) #define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5) #define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6) /* reserved: 7 */ #define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8) #define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9) #define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10) #define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11) #define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12) #define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13) #define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14) #define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15) #define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16) #define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17) #define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18) #define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19) #define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20) #define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21) #define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) struct ib_sa_path_rec { __be64 service_id; union ib_gid 
dgid; union ib_gid sgid; __be16 dlid; __be16 slid; int raw_traffic; /* reserved */ __be32 flow_label; u8 hop_limit; u8 traffic_class; int reversible; u8 numb_path; __be16 pkey; __be16 qos_class; u8 sl; u8 mtu_selector; u8 mtu; u8 rate_selector; u8 rate; u8 packet_life_time_selector; u8 packet_life_time; u8 preference; - u8 smac[ETH_ALEN]; - u8 dmac[6]; - __be16 vlan_id; + u8 dmac[ETH_ALEN]; + /* ignored in IB */ + int ifindex; + /* ignored in IB */ + struct vnet *net; + enum ib_gid_type gid_type; }; +static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec) +{ + return rec->net ? dev_get_by_index(rec->net, rec->ifindex) : NULL; +} + #define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) #define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) #define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) #define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3) #define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4) #define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5) #define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6) #define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7) #define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8) #define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9) #define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10) #define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11) #define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12) #define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13) #define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14) #define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15) #define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16) #define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17) struct ib_sa_mcmember_rec { union ib_gid mgid; union ib_gid port_gid; __be32 qkey; __be16 mlid; u8 mtu_selector; u8 mtu; u8 traffic_class; __be16 pkey; u8 rate_selector; u8 rate; u8 packet_life_time_selector; u8 packet_life_time; u8 sl; __be32 flow_label; u8 hop_limit; u8 scope; u8 join_state; int proxy_join; }; 
/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */ #define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0) #define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1) #define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2) /* reserved: 3 */ #define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4) #define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5) #define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6) #define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7) #define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8) #define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9) #define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10) #define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11) #define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12) #define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13) #define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14) #define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15) #define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16) #define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17) #define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18) #define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19) #define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20) #define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21) #define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22) #define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23) #define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24) #define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25) #define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26) #define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27) #define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28) #define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29) #define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30) #define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31) #define 
IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32) #define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33) #define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34) #define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35) #define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36) #define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF struct ib_sa_service_rec { u64 id; union ib_gid gid; __be16 pkey; /* reserved */ u32 lease; u8 key[16]; u8 name[64]; u8 data8[16]; u16 data16[8]; u32 data32[4]; u64 data64[2]; }; #define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) #define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) #define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) #define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3) #define IB_SA_GUIDINFO_REC_GID0 IB_SA_COMP_MASK(4) #define IB_SA_GUIDINFO_REC_GID1 IB_SA_COMP_MASK(5) #define IB_SA_GUIDINFO_REC_GID2 IB_SA_COMP_MASK(6) #define IB_SA_GUIDINFO_REC_GID3 IB_SA_COMP_MASK(7) #define IB_SA_GUIDINFO_REC_GID4 IB_SA_COMP_MASK(8) #define IB_SA_GUIDINFO_REC_GID5 IB_SA_COMP_MASK(9) #define IB_SA_GUIDINFO_REC_GID6 IB_SA_COMP_MASK(10) #define IB_SA_GUIDINFO_REC_GID7 IB_SA_COMP_MASK(11) struct ib_sa_guidinfo_rec { __be16 lid; u8 block_num; /* reserved */ u8 res1; __be32 res2; u8 guid_info_list[64]; }; struct ib_sa_client { atomic_t users; struct completion comp; }; /** * ib_sa_register_client - Register an SA client. */ void ib_sa_register_client(struct ib_sa_client *client); /** * ib_sa_unregister_client - Deregister an SA client. * @client: Client object to deregister. 
*/ void ib_sa_unregister_client(struct ib_sa_client *client); struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_path_rec *resp, void *context), void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; ib_sa_comp_mask comp_mask; int (*callback)(int status, struct ib_sa_multicast *multicast); void *context; }; /** * ib_sa_join_multicast - Initiates a join request to the specified multicast * group. * @client: SA client * @device: Device associated with the multicast group. * @port_num: Port on the specified device to associate with the multicast * group. * @rec: SA multicast member record specifying group attributes. * @comp_mask: Component mask indicating which group attributes of %rec are * valid. * @gfp_mask: GFP mask for memory allocations. * @callback: User callback invoked once the join operation completes. * @context: User specified context stored with the ib_sa_multicast structure. * * This call initiates a multicast join request with the SA for the specified * multicast group. If the join operation is started successfully, it returns * an ib_sa_multicast structure that is used to track the multicast operation. * Users must free this structure by calling ib_free_multicast, even if the * join operation later fails. (The callback status is non-zero.) 
* * If the join operation fails; status will be non-zero, with the following * failures possible: * -ETIMEDOUT: The request timed out. * -EIO: An error occurred sending the query. * -EINVAL: The MCMemberRecord values differed from the existing group's. * -ENETRESET: Indicates that an fatal error has occurred on the multicast * group, and the user must rejoin the group to continue using it. */ struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, struct ib_sa_multicast *multicast), void *context); /** * ib_free_multicast - Frees the multicast tracking structure, and releases * any reference on the multicast group. * @multicast: Multicast tracking structure allocated by ib_join_multicast. * * This call blocks until the multicast identifier is destroyed. It may * not be called from within the multicast callback; however, returning a non- * zero value from the callback will result in destroying the multicast * tracking structure. */ void ib_sa_free_multicast(struct ib_sa_multicast *multicast); /** * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and * returns it if found. * @device: Device associated with the multicast group. * @port_num: Port on the specified device to associate with the multicast * group. * @mgid: MGID of multicast group. * @rec: Location to copy SA multicast member record. */ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec); /** * ib_init_ah_from_mcmember - Initialize address handle attributes based on * an SA multicast member record. 
*/ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr); /** * ib_init_ah_from_path - Initialize address handle attributes based on an SA * path record. */ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr); /** + * ib_sa_pack_path - Convert a path record from struct ib_sa_path_rec + * to IB MAD wire format. + */ +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute); + +/** * ib_sa_unpack_path - Convert a path record from MAD format to struct * ib_sa_path_rec. */ void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec); /* Support GuidInfoRecord */ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_guidinfo_rec *rec, - ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_guidinfo_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +/* Support get SA ClassPortInfo */ +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + #endif /* IB_SA_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_smi.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_smi.h (revision 319973) +++
projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_smi.h (revision 319974) @@ -1,129 +1,175 @@ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #if !defined(IB_SMI_H) #define IB_SMI_H #include -#include #define IB_SMP_DATA_SIZE 64 #define IB_SMP_MAX_PATH_HOPS 64 struct ib_smp { u8 base_version; u8 mgmt_class; u8 class_version; u8 method; __be16 status; u8 hop_ptr; u8 hop_cnt; __be64 tid; __be16 attr_id; __be16 resv; __be32 attr_mod; __be64 mkey; __be16 dr_slid; __be16 dr_dlid; u8 reserved[28]; u8 data[IB_SMP_DATA_SIZE]; u8 initial_path[IB_SMP_MAX_PATH_HOPS]; u8 return_path[IB_SMP_MAX_PATH_HOPS]; } __attribute__ ((packed)); #define IB_SMP_DIRECTION cpu_to_be16(0x8000) /* Subnet management attributes */ #define IB_SMP_ATTR_NOTICE cpu_to_be16(0x0002) #define IB_SMP_ATTR_NODE_DESC cpu_to_be16(0x0010) #define IB_SMP_ATTR_NODE_INFO cpu_to_be16(0x0011) #define IB_SMP_ATTR_SWITCH_INFO cpu_to_be16(0x0012) #define IB_SMP_ATTR_GUID_INFO cpu_to_be16(0x0014) #define IB_SMP_ATTR_PORT_INFO cpu_to_be16(0x0015) #define IB_SMP_ATTR_PKEY_TABLE cpu_to_be16(0x0016) #define IB_SMP_ATTR_SL_TO_VL_TABLE cpu_to_be16(0x0017) #define IB_SMP_ATTR_VL_ARB_TABLE cpu_to_be16(0x0018) #define IB_SMP_ATTR_LINEAR_FORWARD_TABLE cpu_to_be16(0x0019) #define IB_SMP_ATTR_RANDOM_FORWARD_TABLE cpu_to_be16(0x001A) #define IB_SMP_ATTR_MCAST_FORWARD_TABLE cpu_to_be16(0x001B) #define IB_SMP_ATTR_SM_INFO cpu_to_be16(0x0020) #define IB_SMP_ATTR_VENDOR_DIAG cpu_to_be16(0x0030) #define IB_SMP_ATTR_LED_INFO cpu_to_be16(0x0031) #define IB_SMP_ATTR_VENDOR_MASK cpu_to_be16(0xFF00) struct ib_port_info { __be64 mkey; __be64 gid_prefix; __be16 lid; __be16 sm_lid; __be32 cap_mask; __be16 diag_code; __be16 mkey_lease_period; u8 local_port_num; u8 link_width_enabled; u8 link_width_supported; u8 link_width_active; u8 linkspeed_portstate; /* 4 bits, 4 bits */ u8 portphysstate_linkdown; /* 4 bits, 4 bits */ u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */ u8 linkspeedactive_enabled; /* 4 bits, 4 bits */ u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */ u8 vlcap_inittype; /* 4 bits, 4 bits */ u8 vl_high_limit; u8 vl_arb_high_cap; u8 vl_arb_low_cap; u8 inittypereply_mtucap; /* 4 
bits, 4 bits */ u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */ u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */ __be16 mkey_violations; __be16 pkey_violations; __be16 qkey_violations; u8 guid_cap; u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */ u8 resv_resptimevalue; /* 3 bits, 5 bits */ u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */ __be16 max_credit_hint; u8 resv; u8 link_roundtrip_latency[3]; }; +struct ib_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __packed; + +struct ib_vl_weight_elem { + u8 vl; /* IB: VL is low 4 bits, upper 4 bits reserved */ + /* OPA: VL is low 5 bits, upper 3 bits reserved */ + u8 weight; +}; + static inline u8 -ib_get_smp_direction(struct ib_smp *smp) +ib_get_smp_direction(const struct ib_smp *smp) { return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION); } + +/* + * SM Trap/Notice numbers + */ +#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129) +#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130) +#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131) +#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144) +#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145) +#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256) +#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257) +#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258) + +/* + * Other local changes flags (trap 144). + */ +#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +/* + * M_Key violation flags in dr_trunc_hop (trap 256).
+ */ +#define IB_NOTICE_TRAP_DR_NOTICE 0x80 +#define IB_NOTICE_TRAP_DR_TRUNC 0x40 + #endif /* IB_SMI_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_umem.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_umem.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_umem.h (revision 319974) @@ -1,93 +1,108 @@ /* * Copyright (c) 2007 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #ifndef IB_UMEM_H #define IB_UMEM_H #include #include #include -#include -#include -#include struct ib_ucontext; -struct ib_umem; +struct ib_umem_odp; -typedef void (*umem_invalidate_func_t)(void *invalidation_cookie, - struct ib_umem *umem, - unsigned long addr, size_t size); - -struct invalidation_ctx { - struct ib_umem *umem; - umem_invalidate_func_t func; - void *cookie; - unsigned long context_ticket; - int peer_callback; - int inflight_invalidation; - int peer_invalidated; - struct completion comp; -}; - struct ib_umem { struct ib_ucontext *context; size_t length; - int offset; + unsigned long address; int page_size; int writable; - int hugetlb; struct work_struct work; + pid_t pid; + struct mm_struct *mm; unsigned long diff; - unsigned long start; + struct ib_umem_odp *odp_data; struct sg_table sg_head; - int nmap; + int nmap; int npages; - /* peer memory that manages this umem*/ - struct ib_peer_memory_client *ib_peer_mem; - struct invalidation_ctx *invalidation_ctx; - int peer_mem_srcu_key; - /* peer memory private context */ - void *peer_mem_client_context; }; +/* Returns the offset of the umem start relative to the first page. */ +static inline int ib_umem_offset(struct ib_umem *umem) +{ + return umem->address & ((unsigned long)umem->page_size - 1); +} + +/* Returns the first page of an ODP umem. */ +static inline unsigned long ib_umem_start(struct ib_umem *umem) +{ + return umem->address - ib_umem_offset(umem); +} + +/* Returns the address of the page after the last one of an ODP umem. 
*/ +static inline unsigned long ib_umem_end(struct ib_umem *umem) +{ + return PAGE_ALIGN(umem->address + umem->length); +} + +static inline size_t ib_umem_num_pages(struct ib_umem *umem) +{ + return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT; +} + +#ifdef CONFIG_INFINIBAND_USER_MEM + struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync); -struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync, - int invalidation_supported); -void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, - umem_invalidate_func_t func, - void *cookie); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length); + +#else /* CONFIG_INFINIBAND_USER_MEM */ + +#include + +static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, + unsigned long addr, size_t size, + int access, int dmasync) { + return ERR_PTR(-EINVAL); +} +static inline void ib_umem_release(struct ib_umem *umem) { } +static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } +static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) { + return -EINVAL; +} +#endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_umem_odp.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_umem_odp.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_umem_odp.h (revision 319974) @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_UMEM_ODP_H +#define IB_UMEM_ODP_H + +#include + +#include +#include + +struct umem_odp_node { + u64 __subtree_last; + struct rb_node rb; +}; + +struct ib_umem_odp { + /* + * An array of the pages included in the on-demand paging umem. + * Indices of pages that are currently not mapped into the device will + * contain NULL. + */ + struct page **page_list; + /* + * An array of the same size as page_list, with DMA addresses mapped + * for pages the pages in page_list. The lower two bits designate + * access permissions. See ODP_READ_ALLOWED_BIT and + * ODP_WRITE_ALLOWED_BIT. 
+ */ + dma_addr_t *dma_list; + /* + * The umem_mutex protects the page_list and dma_list fields of an ODP + * umem, allowing only a single thread to map/unmap pages. The mutex + * also protects access to the mmu notifier counters. + */ + struct mutex umem_mutex; + void *private; /* for the HW driver to use. */ + + /* When false, use the notifier counter in the ucontext struct. */ + bool mn_counters_active; + int notifiers_seq; + int notifiers_count; + + /* A linked list of umems that don't have private mmu notifier + * counters yet. */ + struct list_head no_private_counters; + struct ib_umem *umem; + + /* Tree tracking */ + struct umem_odp_node interval_tree; + + struct completion notifier_completion; + int dying; +}; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); + +void ib_umem_odp_release(struct ib_umem *umem); + +/* + * The lower 2 bits of the DMA address signal the R/W permissions for + * the entry. To upgrade the permissions, provide the appropriate + * bitmask to the map_dma_pages function. + * + * Be aware that upgrading a mapped address might result in change of + * the DMA address for the page. + */ +#define ODP_READ_ALLOWED_BIT (1<<0ULL) +#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) + +#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) + +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, + u64 access_mask, unsigned long current_seq); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, + u64 bound); + +void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); +void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); +typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, + void *cookie); +/* + * Call the callback on each ib_umem in the range. Returns the logical or of + * the return values of the functions called. 
+ */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, + umem_call_back cb, void *cookie); + +struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, + u64 start, u64 last); +struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, + u64 start, u64 last); + +static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, + unsigned long mmu_seq) +{ + /* + * This code is strongly based on the KVM code from + * mmu_notifier_retry. Should be called with + * the relevant locks taken (item->odp_data->umem_mutex + * and the ucontext umem_mutex semaphore locked for read). + */ + + /* Do not allow page faults while the new ib_umem hasn't seen a state + * with zero notifiers yet, and doesn't have its own valid set of + * private counters. */ + if (!item->odp_data->mn_counters_active) + return 1; + + if (unlikely(item->odp_data->notifiers_count)) + return 1; + if (item->odp_data->notifiers_seq != mmu_seq) + return 1; + return 0; +} + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +static inline int ib_umem_odp_get(struct ib_ucontext *context, + struct ib_umem *umem) +{ + return -EINVAL; +} + +static inline void ib_umem_odp_release(struct ib_umem *umem) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +#endif /* IB_UMEM_ODP_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_umem_odp.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_mad.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_mad.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_mad.h (revision 319974) @@ -1,202 +1,245 @@ /* 
* Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef IB_USER_MAD_H #define IB_USER_MAD_H #include #include /* * Increment this value if any changes that break userspace ABI * compatibility are made. */ #define IB_USER_MAD_ABI_VERSION 5 /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to * avoid incompatibility between 32-bit userspace and 64-bit kernels). 
*/ /** * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index * @id - ID of agent MAD received with/to be sent with * @status - 0 on successful receive, ETIMEDOUT if no response * received (transaction ID in data[] will be set to TID of original * request) (ignored on send) * @timeout_ms - Milliseconds to wait for response (unset on receive) * @retries - Number of automatic retries to attempt * @qpn - Remote QP number received from/to be sent to * @qkey - Remote Q_Key to be sent with (unset on receive) * @lid - Remote lid received from/to be sent to * @sl - Service level received with/to be sent with * @path_bits - Local path bits received with/to be sent with * @grh_present - If set, GRH was received/should be sent * @gid_index - Local GID index to send with (unset on receive) * @hop_limit - Hop limit in GRH * @traffic_class - Traffic class in GRH * @gid - Remote GID in GRH * @flow_label - Flow label in GRH */ struct ib_user_mad_hdr_old { __u32 id; __u32 status; __u32 timeout_ms; __u32 retries; __u32 length; __be32 qpn; __be32 qkey; __be16 lid; __u8 sl; __u8 path_bits; __u8 grh_present; __u8 gid_index; __u8 hop_limit; __u8 traffic_class; __u8 gid[16]; __be32 flow_label; }; /** * ib_user_mad_hdr - MAD packet header * This layout allows specifying/receiving the P_Key index. To use * this capability, an application must call the * IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before * any other actions with the file handle. 
* @id - ID of agent MAD received with/to be sent with * @status - 0 on successful receive, ETIMEDOUT if no response * received (transaction ID in data[] will be set to TID of original * request) (ignored on send) * @timeout_ms - Milliseconds to wait for response (unset on receive) * @retries - Number of automatic retries to attempt * @qpn - Remote QP number received from/to be sent to * @qkey - Remote Q_Key to be sent with (unset on receive) * @lid - Remote lid received from/to be sent to * @sl - Service level received with/to be sent with * @path_bits - Local path bits received with/to be sent with * @grh_present - If set, GRH was received/should be sent * @gid_index - Local GID index to send with (unset on receive) * @hop_limit - Hop limit in GRH * @traffic_class - Traffic class in GRH * @gid - Remote GID in GRH * @flow_label - Flow label in GRH * @pkey_index - P_Key index */ struct ib_user_mad_hdr { __u32 id; __u32 status; __u32 timeout_ms; __u32 retries; __u32 length; __be32 qpn; __be32 qkey; __be16 lid; __u8 sl; __u8 path_bits; __u8 grh_present; __u8 gid_index; __u8 hop_limit; __u8 traffic_class; __u8 gid[16]; __be32 flow_label; __u16 pkey_index; __u8 reserved[6]; }; /** * ib_user_mad - MAD packet * @hdr - MAD packet header * @data - Contents of MAD * */ struct ib_user_mad { struct ib_user_mad_hdr hdr; __u64 data[0]; }; /* * Earlier versions of this interface definition declared the * method_mask[] member as an array of __u32 but treated it as a * bitmap made up of longs in the kernel. This ambiguity meant that * 32-bit big-endian applications that can run on both 32-bit and * 64-bit kernels had no consistent ABI to rely on, and 64-bit * big-endian applications that treated method_mask as being made up * of 32-bit words would have their bitmap misinterpreted. 
* * To clear up this confusion, we change the declaration of * method_mask[] to use unsigned long and handle the conversion from * 32-bit userspace to 64-bit kernel for big-endian systems in the * compat_ioctl method. Unfortunately, to keep the structure layout * the same, we need the method_mask[] array to be aligned only to 4 * bytes even when long is 64 bits, which forces us into this ugly * typedef. */ typedef unsigned long __attribute__((aligned(4))) packed_ulong; #define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long))) /** * ib_user_mad_reg_req - MAD registration request * @id - Set by the kernel; used to identify agent in future requests. * @qpn - Queue pair number; must be 0 or 1. * @method_mask - The caller will receive unsolicited MADs for any method * where @method_mask = 1. * @mgmt_class - Indicates which management class of MADs should be receive * by the caller. This field is only required if the user wishes to * receive unsolicited MADs, otherwise it should be 0. * @mgmt_class_version - Indicates which version of MADs for the given * management class to receive. * @oui: Indicates IEEE OUI when mgmt_class is a vendor class * in the range from 0x30 to 0x4f. Otherwise not used. * @rmpp_version: If set, indicates the RMPP version used. * */ struct ib_user_mad_reg_req { __u32 id; packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK]; __u8 qpn; __u8 mgmt_class; __u8 mgmt_class_version; __u8 oui[3]; __u8 rmpp_version; }; +/** + * ib_user_mad_reg_req2 - MAD registration request + * + * @id - Set by the _kernel_; used by userspace to identify the + * registered agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @mgmt_class - Indicates which management class of MADs should be + * receive by the caller. This field is only required if + * the user wishes to receive unsolicited MADs, otherwise + * it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. 
+ * @res - Ignored. + * @flags - additional registration flags; Must be in the set of + * flags defined in IB_USER_MAD_REG_FLAGS_CAP + * @method_mask - The caller wishes to receive unsolicited MADs for the + * methods whose bit(s) is(are) set. + * @oui - Indicates IEEE OUI to use when mgmt_class is a vendor + * class in the range from 0x30 to 0x4f. Otherwise not + * used. + * @rmpp_version - If set, indicates the RMPP version to use. + */ +enum { + IB_USER_MAD_USER_RMPP = (1 << 0), +}; +#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP) +struct ib_user_mad_reg_req2 { + __u32 id; + __u32 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u16 res; + __u32 flags; + __u64 method_mask[2]; + __u32 oui; + __u8 rmpp_version; + __u8 reserved[3]; +}; + #define IB_IOCTL_MAGIC 0x1b -#define IB_USER_MAD_REGISTER_AGENT _IO(IB_IOCTL_MAGIC, 1) +#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ + struct ib_user_mad_reg_req) -#define IB_USER_MAD_UNREGISTER_AGENT _IO(IB_IOCTL_MAGIC, 2) +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) #define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) + +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ + struct ib_user_mad_reg_req2) #endif /* IB_USER_MAD_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_verbs.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_verbs.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_user_verbs.h (revision 319974) @@ -1,980 +1,1067 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * Copyright (c) 2006 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef IB_USER_VERBS_H #define IB_USER_VERBS_H #include /* * Increment this value if any changes that break userspace ABI * compatibility are made. 
*/ #define IB_USER_VERBS_ABI_VERSION 6 #define IB_USER_VERBS_CMD_THRESHOLD 50 -/* - * To support 6 legacy commands using the old extension style - */ -#define IB_USER_VERBS_LEGACY_CMD_FIRST 52 -#define IB_USER_VERBS_LEGACY_EX_CMD_LAST 56 - enum { IB_USER_VERBS_CMD_GET_CONTEXT, IB_USER_VERBS_CMD_QUERY_DEVICE, IB_USER_VERBS_CMD_QUERY_PORT, IB_USER_VERBS_CMD_ALLOC_PD, IB_USER_VERBS_CMD_DEALLOC_PD, IB_USER_VERBS_CMD_CREATE_AH, IB_USER_VERBS_CMD_MODIFY_AH, IB_USER_VERBS_CMD_QUERY_AH, IB_USER_VERBS_CMD_DESTROY_AH, IB_USER_VERBS_CMD_REG_MR, IB_USER_VERBS_CMD_REG_SMR, IB_USER_VERBS_CMD_REREG_MR, IB_USER_VERBS_CMD_QUERY_MR, IB_USER_VERBS_CMD_DEREG_MR, IB_USER_VERBS_CMD_ALLOC_MW, IB_USER_VERBS_CMD_BIND_MW, IB_USER_VERBS_CMD_DEALLOC_MW, IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, IB_USER_VERBS_CMD_CREATE_CQ, IB_USER_VERBS_CMD_RESIZE_CQ, IB_USER_VERBS_CMD_DESTROY_CQ, IB_USER_VERBS_CMD_POLL_CQ, IB_USER_VERBS_CMD_PEEK_CQ, IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, IB_USER_VERBS_CMD_CREATE_QP, IB_USER_VERBS_CMD_QUERY_QP, IB_USER_VERBS_CMD_MODIFY_QP, IB_USER_VERBS_CMD_DESTROY_QP, IB_USER_VERBS_CMD_POST_SEND, IB_USER_VERBS_CMD_POST_RECV, IB_USER_VERBS_CMD_ATTACH_MCAST, IB_USER_VERBS_CMD_DETACH_MCAST, IB_USER_VERBS_CMD_CREATE_SRQ, IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, IB_USER_VERBS_CMD_POST_SRQ_RECV, IB_USER_VERBS_CMD_OPEN_XRCD, IB_USER_VERBS_CMD_CLOSE_XRCD, IB_USER_VERBS_CMD_CREATE_XSRQ, - IB_USER_VERBS_CMD_OPEN_QP + IB_USER_VERBS_CMD_OPEN_QP, }; enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, + IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ, + IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_EX_CMD_DESTROY_FLOW + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, + IB_USER_VERBS_EX_CMD_CREATE_WQ, + IB_USER_VERBS_EX_CMD_MODIFY_WQ, + IB_USER_VERBS_EX_CMD_DESTROY_WQ, + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + 
IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL }; - /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to * avoid incompatibility between 32-bit userspace and 64-bit kernels). * Specifically: * - Do not use pointer types -- pass pointers in __u64 instead. * - Make sure that any structure larger than 4 bytes is padded to a * multiple of 8 bytes. Otherwise the structure size will be * different between 32-bit and 64-bit architectures. */ struct ib_uverbs_async_event_desc { __u64 element; __u32 event_type; /* enum ib_event_type */ __u32 reserved; }; struct ib_uverbs_comp_event_desc { __u64 cq_handle; }; /* * All commands from userspace should start with a __u32 command field * followed by __u16 in_words and out_words fields (which give the * length of the command block and response buffer if any in 32-bit * words). The kernel driver will read these fields first and read * the rest of the command struct based on these value. 
*/ -#define IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, field) \ - ((ibv_type *)((void *)(ex_ptr) + offsetof(ex_type, \ - field) + sizeof((ex_ptr)->field))) - -#define IBV_RESP_TO_VERBS_RESP_EX(ex_ptr, ex_type, ibv_type) \ - IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, comp_mask) - - #define IB_USER_VERBS_CMD_COMMAND_MASK 0xff #define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u #define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 #define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80 struct ib_uverbs_cmd_hdr { __u32 command; __u16 in_words; __u16 out_words; }; struct ib_uverbs_ex_cmd_hdr { __u64 response; __u16 provider_in_words; __u16 provider_out_words; __u32 cmd_hdr_reserved; }; struct ib_uverbs_get_context { __u64 response; __u64 driver_data[0]; }; struct ib_uverbs_get_context_resp { __u32 async_fd; __u32 num_comp_vectors; }; struct ib_uverbs_query_device { __u64 response; __u64 driver_data[0]; }; -struct ib_uverbs_query_device_ex { - __u64 comp_mask; - __u64 driver_data[0]; -}; - struct ib_uverbs_query_device_resp { __u64 fw_ver; __be64 node_guid; __be64 sys_image_guid; __u64 max_mr_size; __u64 page_size_cap; __u32 vendor_id; __u32 vendor_part_id; __u32 hw_ver; __u32 max_qp; __u32 max_qp_wr; __u32 device_cap_flags; __u32 max_sge; __u32 max_sge_rd; __u32 max_cq; __u32 max_cqe; __u32 max_mr; __u32 max_pd; __u32 max_qp_rd_atom; __u32 max_ee_rd_atom; __u32 max_res_rd_atom; __u32 max_qp_init_rd_atom; __u32 max_ee_init_rd_atom; __u32 atomic_cap; __u32 max_ee; __u32 max_rdd; __u32 max_mw; __u32 max_raw_ipv6_qp; __u32 max_raw_ethy_qp; __u32 max_mcast_grp; __u32 max_mcast_qp_attach; __u32 max_total_mcast_qp_attach; __u32 max_ah; __u32 max_fmr; __u32 max_map_per_fmr; __u32 max_srq; __u32 max_srq_wr; __u32 max_srq_sge; __u16 max_pkeys; __u8 local_ca_ack_delay; __u8 phys_port_cnt; __u8 reserved[4]; }; +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_odp_caps { + __u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 
uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + +struct ib_uverbs_rss_caps { + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; + __u32 max_rwq_indirection_tables; + __u32 max_rwq_indirection_table_size; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 response_length; + struct ib_uverbs_odp_caps odp_caps; + __u64 timestamp_mask; + __u64 hca_core_clock; /* in KHZ */ + __u64 device_cap_flags_ex; + struct ib_uverbs_rss_caps rss_caps; + __u32 max_wq_type_rq; + __u32 reserved; +}; + struct ib_uverbs_query_port { __u64 response; __u8 port_num; __u8 reserved[7]; __u64 driver_data[0]; }; struct ib_uverbs_query_port_resp { __u32 port_cap_flags; __u32 max_msg_sz; __u32 bad_pkey_cntr; __u32 qkey_viol_cntr; __u32 gid_tbl_len; __u16 pkey_tbl_len; __u16 lid; __u16 sm_lid; __u8 state; __u8 max_mtu; __u8 active_mtu; __u8 lmc; __u8 max_vl_num; __u8 sm_sl; __u8 subnet_timeout; __u8 init_type_reply; __u8 active_width; __u8 active_speed; __u8 phys_state; __u8 link_layer; __u8 reserved[2]; }; struct ib_uverbs_alloc_pd { __u64 response; __u64 driver_data[0]; }; struct ib_uverbs_alloc_pd_resp { __u32 pd_handle; }; struct ib_uverbs_dealloc_pd { __u32 pd_handle; }; struct ib_uverbs_open_xrcd { __u64 response; __u32 fd; __u32 oflags; __u64 driver_data[0]; }; struct ib_uverbs_open_xrcd_resp { __u32 xrcd_handle; }; struct ib_uverbs_close_xrcd { __u32 xrcd_handle; }; struct ib_uverbs_reg_mr { __u64 response; __u64 start; __u64 length; __u64 hca_va; __u32 pd_handle; __u32 access_flags; __u64 driver_data[0]; }; struct ib_uverbs_reg_mr_resp { __u32 mr_handle; __u32 lkey; __u32 rkey; }; +struct ib_uverbs_rereg_mr { + __u64 response; + __u32 mr_handle; + __u32 flags; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; +}; + +struct 
ib_uverbs_rereg_mr_resp { + __u32 lkey; + __u32 rkey; +}; + struct ib_uverbs_dereg_mr { __u32 mr_handle; }; struct ib_uverbs_alloc_mw { __u64 response; __u32 pd_handle; __u8 mw_type; __u8 reserved[3]; }; struct ib_uverbs_alloc_mw_resp { __u32 mw_handle; __u32 rkey; }; struct ib_uverbs_dealloc_mw { __u32 mw_handle; }; struct ib_uverbs_create_comp_channel { __u64 response; }; struct ib_uverbs_create_comp_channel_resp { __u32 fd; }; struct ib_uverbs_create_cq { __u64 response; __u64 user_handle; __u32 cqe; __u32 comp_vector; __s32 comp_channel; __u32 reserved; __u64 driver_data[0]; }; -struct ib_uverbs_create_cq_resp { - __u32 cq_handle; - __u32 cqe; -}; - -enum ib_uverbs_create_cq_ex_comp_mask { - IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS = (u64)1 << 0, -}; - -struct ib_uverbs_create_cq_ex { - __u64 comp_mask; +struct ib_uverbs_ex_create_cq { __u64 user_handle; __u32 cqe; __u32 comp_vector; __s32 comp_channel; + __u32 comp_mask; + __u32 flags; __u32 reserved; - __u64 create_flags; - __u64 driver_data[0]; }; -struct ib_uverbs_modify_cq_ex { - __u64 comp_mask; +struct ib_uverbs_create_cq_resp { __u32 cq_handle; - __u32 attr_mask; - __u16 cq_count; - __u16 cq_period; - __u32 cq_cap_flags; + __u32 cqe; }; +struct ib_uverbs_ex_create_cq_resp { + struct ib_uverbs_create_cq_resp base; + __u32 comp_mask; + __u32 response_length; +}; + struct ib_uverbs_resize_cq { __u64 response; __u32 cq_handle; __u32 cqe; __u64 driver_data[0]; }; struct ib_uverbs_resize_cq_resp { __u32 cqe; __u32 reserved; __u64 driver_data[0]; }; struct ib_uverbs_poll_cq { __u64 response; __u32 cq_handle; __u32 ne; }; struct ib_uverbs_wc { __u64 wr_id; __u32 status; __u32 opcode; __u32 vendor_err; __u32 byte_len; union { __u32 imm_data; __u32 invalidate_rkey; } ex; __u32 qp_num; __u32 src_qp; __u32 wc_flags; __u16 pkey_index; __u16 slid; __u8 sl; __u8 dlid_path_bits; __u8 port_num; __u8 reserved; }; struct ib_uverbs_poll_cq_resp { __u32 count; __u32 reserved; struct ib_uverbs_wc wc[0]; }; struct 
ib_uverbs_req_notify_cq { __u32 cq_handle; __u32 solicited_only; }; struct ib_uverbs_destroy_cq { __u64 response; __u32 cq_handle; __u32 reserved; }; struct ib_uverbs_destroy_cq_resp { __u32 comp_events_reported; __u32 async_events_reported; }; struct ib_uverbs_global_route { __u8 dgid[16]; __u32 flow_label; __u8 sgid_index; __u8 hop_limit; __u8 traffic_class; __u8 reserved; }; struct ib_uverbs_ah_attr { struct ib_uverbs_global_route grh; __u16 dlid; __u8 sl; __u8 src_path_bits; __u8 static_rate; __u8 is_global; __u8 port_num; __u8 reserved; }; struct ib_uverbs_qp_attr { __u32 qp_attr_mask; __u32 qp_state; __u32 cur_qp_state; __u32 path_mtu; __u32 path_mig_state; __u32 qkey; __u32 rq_psn; __u32 sq_psn; __u32 dest_qp_num; __u32 qp_access_flags; struct ib_uverbs_ah_attr ah_attr; struct ib_uverbs_ah_attr alt_ah_attr; /* ib_qp_cap */ __u32 max_send_wr; __u32 max_recv_wr; __u32 max_send_sge; __u32 max_recv_sge; __u32 max_inline_data; __u16 pkey_index; __u16 alt_pkey_index; __u8 en_sqd_async_notify; __u8 sq_draining; __u8 max_rd_atomic; __u8 max_dest_rd_atomic; __u8 min_rnr_timer; __u8 port_num; __u8 timeout; __u8 retry_cnt; __u8 rnr_retry; __u8 alt_port_num; __u8 alt_timeout; __u8 reserved[5]; }; struct ib_uverbs_create_qp { __u64 response; __u64 user_handle; __u32 pd_handle; __u32 send_cq_handle; __u32 recv_cq_handle; __u32 srq_handle; __u32 max_send_wr; __u32 max_recv_wr; __u32 max_send_sge; __u32 max_recv_sge; __u32 max_inline_data; __u8 sq_sig_all; __u8 qp_type; __u8 is_srq; __u8 reserved; __u64 driver_data[0]; }; +enum ib_uverbs_create_qp_mask { + IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0, +}; + +enum { + IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, +}; + +struct ib_uverbs_ex_create_qp { + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 
qp_type; + __u8 is_srq; + __u8 reserved; + __u32 comp_mask; + __u32 create_flags; + __u32 rwq_ind_tbl_handle; + __u32 reserved1; +}; + struct ib_uverbs_open_qp { __u64 response; __u64 user_handle; __u32 pd_handle; __u32 qpn; __u8 qp_type; __u8 reserved[7]; __u64 driver_data[0]; }; /* also used for open response */ struct ib_uverbs_create_qp_resp { __u32 qp_handle; __u32 qpn; __u32 max_send_wr; __u32 max_recv_wr; __u32 max_send_sge; __u32 max_recv_sge; __u32 max_inline_data; __u32 reserved; }; +struct ib_uverbs_ex_create_qp_resp { + struct ib_uverbs_create_qp_resp base; + __u32 comp_mask; + __u32 response_length; +}; + /* * This struct needs to remain a multiple of 8 bytes to keep the * alignment of the modify QP parameters. */ struct ib_uverbs_qp_dest { __u8 dgid[16]; __u32 flow_label; __u16 dlid; __u16 reserved; __u8 sgid_index; __u8 hop_limit; __u8 traffic_class; __u8 sl; __u8 src_path_bits; __u8 static_rate; __u8 is_global; __u8 port_num; }; struct ib_uverbs_query_qp { __u64 response; __u32 qp_handle; __u32 attr_mask; __u64 driver_data[0]; }; struct ib_uverbs_query_qp_resp { struct ib_uverbs_qp_dest dest; struct ib_uverbs_qp_dest alt_dest; __u32 max_send_wr; __u32 max_recv_wr; __u32 max_send_sge; __u32 max_recv_sge; __u32 max_inline_data; __u32 qkey; __u32 rq_psn; __u32 sq_psn; __u32 dest_qp_num; __u32 qp_access_flags; __u16 pkey_index; __u16 alt_pkey_index; __u8 qp_state; __u8 cur_qp_state; __u8 path_mtu; __u8 path_mig_state; __u8 sq_draining; __u8 max_rd_atomic; __u8 max_dest_rd_atomic; __u8 min_rnr_timer; __u8 port_num; __u8 timeout; __u8 retry_cnt; __u8 rnr_retry; __u8 alt_port_num; __u8 alt_timeout; __u8 sq_sig_all; __u8 reserved[5]; __u64 driver_data[0]; }; struct ib_uverbs_modify_qp { struct ib_uverbs_qp_dest dest; struct ib_uverbs_qp_dest alt_dest; __u32 qp_handle; __u32 attr_mask; __u32 qkey; __u32 rq_psn; __u32 sq_psn; __u32 dest_qp_num; __u32 qp_access_flags; __u16 pkey_index; __u16 alt_pkey_index; __u8 qp_state; __u8 cur_qp_state; __u8 path_mtu; __u8 
path_mig_state; __u8 en_sqd_async_notify; __u8 max_rd_atomic; __u8 max_dest_rd_atomic; __u8 min_rnr_timer; __u8 port_num; __u8 timeout; __u8 retry_cnt; __u8 rnr_retry; __u8 alt_port_num; __u8 alt_timeout; __u8 reserved[2]; __u64 driver_data[0]; }; -enum ib_uverbs_modify_qp_ex_comp_mask { - IB_UVERBS_QP_ATTR_DCT_KEY = 1ULL << 0, -}; - -struct ib_uverbs_modify_qp_ex { - __u32 comp_mask; - struct ib_uverbs_qp_dest dest; - struct ib_uverbs_qp_dest alt_dest; - __u32 qp_handle; - __u32 attr_mask; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 qp_state; - __u8 cur_qp_state; - __u8 path_mtu; - __u8 path_mig_state; - __u8 en_sqd_async_notify; - __u8 max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; - __u8 reserved[2]; - __u64 dct_key; - __u64 driver_data[0]; -}; - struct ib_uverbs_modify_qp_resp { }; struct ib_uverbs_destroy_qp { __u64 response; __u32 qp_handle; __u32 reserved; }; struct ib_uverbs_destroy_qp_resp { __u32 events_reported; }; /* * The ib_uverbs_sge structure isn't used anywhere, since we assume * the ib_sge structure is packed the same way on 32-bit and 64-bit * architectures in both kernel and user space. It's just here to * document the ABI. 
*/ struct ib_uverbs_sge { __u64 addr; __u32 length; __u32 lkey; }; struct ib_uverbs_send_wr { __u64 wr_id; __u32 num_sge; __u32 opcode; __u32 send_flags; union { __u32 imm_data; __u32 invalidate_rkey; } ex; union { struct { __u64 remote_addr; __u32 rkey; __u32 reserved; } rdma; struct { __u64 remote_addr; __u64 compare_add; __u64 swap; __u32 rkey; __u32 reserved; } atomic; struct { __u32 ah; __u32 remote_qpn; __u32 remote_qkey; __u32 reserved; } ud; } wr; }; struct ib_uverbs_post_send { __u64 response; __u32 qp_handle; __u32 wr_count; __u32 sge_count; __u32 wqe_size; struct ib_uverbs_send_wr send_wr[0]; }; struct ib_uverbs_post_send_resp { __u32 bad_wr; }; struct ib_uverbs_recv_wr { __u64 wr_id; __u32 num_sge; __u32 reserved; }; struct ib_uverbs_post_recv { __u64 response; __u32 qp_handle; __u32 wr_count; __u32 sge_count; __u32 wqe_size; struct ib_uverbs_recv_wr recv_wr[0]; }; struct ib_uverbs_post_recv_resp { __u32 bad_wr; }; struct ib_uverbs_post_srq_recv { __u64 response; __u32 srq_handle; __u32 wr_count; __u32 sge_count; __u32 wqe_size; struct ib_uverbs_recv_wr recv[0]; }; struct ib_uverbs_post_srq_recv_resp { __u32 bad_wr; }; struct ib_uverbs_create_ah { __u64 response; __u64 user_handle; __u32 pd_handle; __u32 reserved; struct ib_uverbs_ah_attr attr; }; struct ib_uverbs_create_ah_resp { __u32 ah_handle; }; struct ib_uverbs_destroy_ah { __u32 ah_handle; }; struct ib_uverbs_attach_mcast { __u8 gid[16]; __u32 qp_handle; __u16 mlid; __u16 reserved; __u64 driver_data[0]; }; struct ib_uverbs_detach_mcast { __u8 gid[16]; __u32 qp_handle; __u16 mlid; __u16 reserved; __u64 driver_data[0]; }; struct ib_uverbs_flow_spec_hdr { - __u32 type; + __u32 type; __u16 size; __u16 reserved; /* followed by flow_spec */ __u64 flow_spec_data[0]; }; -struct ib_kern_eth_filter { - __u8 dst_mac[6]; - __u8 src_mac[6]; - __be16 ether_type; - __be16 vlan_tag; +struct ib_uverbs_flow_eth_filter { + __u8 dst_mac[6]; + __u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; }; struct 
ib_uverbs_flow_spec_eth { union { struct ib_uverbs_flow_spec_hdr hdr; struct { __u32 type; __u16 size; __u16 reserved; }; }; - struct ib_kern_eth_filter val; - struct ib_kern_eth_filter mask; + struct ib_uverbs_flow_eth_filter val; + struct ib_uverbs_flow_eth_filter mask; }; -struct ib_kern_ib_filter { - __be32 l3_type_qpn; - __u8 dst_gid[16]; +struct ib_uverbs_flow_ipv4_filter { + __be32 src_ip; + __be32 dst_ip; + __u8 proto; + __u8 tos; + __u8 ttl; + __u8 flags; }; -struct ib_uverbs_flow_spec_ib { +struct ib_uverbs_flow_spec_ipv4 { union { struct ib_uverbs_flow_spec_hdr hdr; struct { __u32 type; __u16 size; __u16 reserved; }; }; - struct ib_kern_ib_filter val; - struct ib_kern_ib_filter mask; + struct ib_uverbs_flow_ipv4_filter val; + struct ib_uverbs_flow_ipv4_filter mask; }; -struct ib_kern_ipv4_filter { - __be32 src_ip; - __be32 dst_ip; +struct ib_uverbs_flow_tcp_udp_filter { + __be16 dst_port; + __be16 src_port; }; -struct ib_uverbs_flow_spec_ipv4 { +struct ib_uverbs_flow_spec_tcp_udp { union { struct ib_uverbs_flow_spec_hdr hdr; struct { __u32 type; __u16 size; __u16 reserved; }; }; - struct ib_kern_ipv4_filter val; - struct ib_kern_ipv4_filter mask; + struct ib_uverbs_flow_tcp_udp_filter val; + struct ib_uverbs_flow_tcp_udp_filter mask; }; -struct ib_kern_tcp_udp_filter { - __be16 dst_port; - __be16 src_port; +struct ib_uverbs_flow_ipv6_filter { + __u8 src_ip[16]; + __u8 dst_ip[16]; + __be32 flow_label; + __u8 next_hdr; + __u8 traffic_class; + __u8 hop_limit; + __u8 reserved; }; -struct ib_uverbs_flow_spec_tcp_udp { +struct ib_uverbs_flow_spec_ipv6 { union { struct ib_uverbs_flow_spec_hdr hdr; struct { __u32 type; __u16 size; __u16 reserved; }; }; - struct ib_kern_tcp_udp_filter val; - struct ib_kern_tcp_udp_filter mask; + struct ib_uverbs_flow_ipv6_filter val; + struct ib_uverbs_flow_ipv6_filter mask; }; struct ib_uverbs_flow_attr { - __u32 type; - __u16 size; - __u16 priority; - __u8 num_of_specs; - __u8 reserved[2]; - __u8 port; - __u32 flags; + __u32 
type; + __u16 size; + __u16 priority; + __u8 num_of_specs; + __u8 reserved[2]; + __u8 port; + __u32 flags; /* Following are the optional layers according to user request * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ struct ib_uverbs_flow_spec_hdr flow_specs[0]; }; struct ib_uverbs_create_flow { __u32 comp_mask; __u32 qp_handle; struct ib_uverbs_flow_attr flow_attr; }; struct ib_uverbs_create_flow_resp { __u32 comp_mask; __u32 flow_handle; }; struct ib_uverbs_destroy_flow { __u32 comp_mask; __u32 flow_handle; }; struct ib_uverbs_create_srq { __u64 response; __u64 user_handle; __u32 pd_handle; __u32 max_wr; __u32 max_sge; __u32 srq_limit; __u64 driver_data[0]; }; struct ib_uverbs_create_xsrq { __u64 response; __u64 user_handle; __u32 srq_type; __u32 pd_handle; __u32 max_wr; __u32 max_sge; __u32 srq_limit; __u32 reserved; __u32 xrcd_handle; __u32 cq_handle; __u64 driver_data[0]; }; struct ib_uverbs_create_srq_resp { __u32 srq_handle; __u32 max_wr; __u32 max_sge; __u32 srqn; }; struct ib_uverbs_modify_srq { __u32 srq_handle; __u32 attr_mask; __u32 max_wr; __u32 srq_limit; __u64 driver_data[0]; }; struct ib_uverbs_query_srq { __u64 response; __u32 srq_handle; __u32 reserved; __u64 driver_data[0]; }; struct ib_uverbs_query_srq_resp { __u32 max_wr; __u32 max_sge; __u32 srq_limit; __u32 reserved; }; struct ib_uverbs_destroy_srq { __u64 response; __u32 srq_handle; __u32 reserved; }; struct ib_uverbs_destroy_srq_resp { __u32 events_reported; }; +struct ib_uverbs_ex_create_wq { + __u32 comp_mask; + __u32 wq_type; + __u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 max_wr; + __u32 max_sge; +}; -/* - * Legacy extended verbs related structures - */ -struct ib_uverbs_ex_cmd_hdr_legacy { - __u32 command; - __u16 in_words; - __u16 out_words; - __u16 provider_in_words; - __u16 provider_out_words; - __u32 cmd_hdr_reserved; +struct ib_uverbs_ex_create_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 wq_handle; + __u32 max_wr; + __u32 max_sge; + 
__u32 wqn; }; -struct ib_uverbs_ex_cmd_resp1_legacy { - __u64 comp_mask; - __u64 response; +struct ib_uverbs_ex_destroy_wq { + __u32 comp_mask; + __u32 wq_handle; +}; + +struct ib_uverbs_ex_destroy_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 events_reported; + __u32 reserved; +}; + +struct ib_uverbs_ex_modify_wq { + __u32 attr_mask; + __u32 wq_handle; + __u32 wq_state; + __u32 curr_wq_state; +}; + +/* Prevent memory allocation rather than max expected size */ +#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d +struct ib_uverbs_ex_create_rwq_ind_table { + __u32 comp_mask; + __u32 log_ind_tbl_size; + /* Following are the wq handles according to log_ind_tbl_size + * wq_handle1 + * wq_handle2 + */ + __u32 wq_handles[0]; +}; + +struct ib_uverbs_ex_create_rwq_ind_table_resp { + __u32 comp_mask; + __u32 response_length; + __u32 ind_tbl_handle; + __u32 ind_tbl_num; +}; + +struct ib_uverbs_ex_destroy_rwq_ind_table { + __u32 comp_mask; + __u32 ind_tbl_handle; }; #endif /* IB_USER_VERBS_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_verbs.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_verbs.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/ib_verbs.h (revision 319974) @@ -1,2898 +1,3359 @@ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #if !defined(IB_VERBS_H) #define IB_VERBS_H #include #include #include #include #include #include #include #include #include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include +struct ifla_vf_info; +struct ifla_vf_stats; + extern struct workqueue_struct *ib_wq; +extern struct workqueue_struct *ib_comp_wq; union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; +extern union ib_gid zgid; + +enum ib_gid_type { + /* If link layer is Ethernet, this is RoCE V1 */ + IB_GID_TYPE_IB = 0, + IB_GID_TYPE_ROCE = 0, + IB_GID_TYPE_ROCE_UDP_ENCAP = 1, + IB_GID_TYPE_SIZE +}; + +#define ROCE_V2_UDP_DPORT 4791 +struct ib_gid_attr { + enum ib_gid_type gid_type; + struct net_device *ndev; +}; + enum rdma_node_type { /* IB values map to NodeInfo:NodeType. */ RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, RDMA_NODE_RNIC, - RDMA_NODE_MIC + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, }; +enum { + /* set the local administered indication */ + IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, +}; + enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, - RDMA_TRANSPORT_SCIF + RDMA_TRANSPORT_USNIC, + RDMA_TRANSPORT_USNIC_UDP }; -enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__; +enum rdma_protocol_type { + RDMA_PROTOCOL_IB, + RDMA_PROTOCOL_IBOE, + RDMA_PROTOCOL_IWARP, + RDMA_PROTOCOL_USNIC_UDP +}; +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type); + +enum rdma_network_type { + RDMA_NETWORK_IB, + RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB, + RDMA_NETWORK_IPV4, + RDMA_NETWORK_IPV6 +}; + +static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) +{ + if (network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */ + return IB_GID_TYPE_IB; +} 
+ +static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type, + union ib_gid *gid) +{ + if (gid_type == IB_GID_TYPE_IB) + return RDMA_NETWORK_IB; + + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + return RDMA_NETWORK_IPV4; + else + return RDMA_NETWORK_IPV6; +} + enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, - IB_LINK_LAYER_SCIF }; enum ib_device_cap_flags { - IB_DEVICE_RESIZE_MAX_WR = 1, - IB_DEVICE_BAD_PKEY_CNTR = (1<<1), - IB_DEVICE_BAD_QKEY_CNTR = (1<<2), - IB_DEVICE_RAW_MULTI = (1<<3), - IB_DEVICE_AUTO_PATH_MIG = (1<<4), - IB_DEVICE_CHANGE_PHY_PORT = (1<<5), - IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6), - IB_DEVICE_CURR_QP_STATE_MOD = (1<<7), - IB_DEVICE_SHUTDOWN_PORT = (1<<8), - IB_DEVICE_INIT_TYPE = (1<<9), - IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10), - IB_DEVICE_SYS_IMAGE_GUID = (1<<11), - IB_DEVICE_RC_RNR_NAK_GEN = (1<<12), - IB_DEVICE_SRQ_RESIZE = (1<<13), - IB_DEVICE_N_NOTIFY_CQ = (1<<14), - IB_DEVICE_LOCAL_DMA_LKEY = (1<<15), - IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */ - IB_DEVICE_MEM_WINDOW = (1<<17), + IB_DEVICE_RESIZE_MAX_WR = (1 << 0), + IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), + IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), + IB_DEVICE_RAW_MULTI = (1 << 3), + IB_DEVICE_AUTO_PATH_MIG = (1 << 4), + IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), + IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), + IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), + IB_DEVICE_SHUTDOWN_PORT = (1 << 8), + IB_DEVICE_INIT_TYPE = (1 << 9), + IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), + IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), + IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), + IB_DEVICE_SRQ_RESIZE = (1 << 13), + IB_DEVICE_N_NOTIFY_CQ = (1 << 14), + /* + * This device supports a per-device lkey or stag that can be + * used without performing a memory registration for the local + * memory. Note that ULPs should never check this flag, but + * instead use the local_dma_lkey flag in the ib_pd structure, + * which will always contain a usable lkey.
+ */ + IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), + IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), + IB_DEVICE_MEM_WINDOW = (1 << 17), + /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ - IB_DEVICE_UD_IP_CSUM = (1<<18), - IB_DEVICE_UD_TSO = (1<<19), - IB_DEVICE_XRC = (1<<20), - IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), - IB_DEVICE_MR_ALLOCATE = (1<<23), - IB_DEVICE_SHARED_MR = (1<<24), - IB_DEVICE_QPG = (1<<25), - IB_DEVICE_UD_RSS = (1<<26), - IB_DEVICE_UD_TSS = (1<<27), - IB_DEVICE_CROSS_CHANNEL = (1<<28), - IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), + IB_DEVICE_UD_IP_CSUM = (1 << 18), + IB_DEVICE_UD_TSO = (1 << 19), + IB_DEVICE_XRC = (1 << 20), + /* - * Devices can set either IB_DEVICE_MEM_WINDOW_TYPE_2A or - * IB_DEVICE_MEM_WINDOW_TYPE_2B if it supports type 2A or type 2B - * memory windows. It can set neither to indicate it doesn't support - * type 2 windows at all. + * This device supports the IB "base memory management extension", + * which includes support for fast registrations (IB_WR_REG_MR, + * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should + * also be set by any iWarp device which must support FRs to comply + * to the iWarp verbs spec. iWarp devices also support the + * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the + * stag. 
*/ - IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<30), - IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<31), - IB_DEVICE_SIGNATURE_HANDOVER = (1LL<<32) + IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), + IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), + IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), + IB_DEVICE_RC_IP_CSUM = (1 << 25), + IB_DEVICE_RAW_IP_CSUM = (1 << 26), + /* + * Devices should set IB_DEVICE_CROSS_CHANNEL if they + * support execution of WQEs that involve synchronization + * of I/O operations with single completion queue managed + * by hardware. + */ + IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), + IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), + IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), + IB_DEVICE_SG_GAPS_REG = (1ULL << 32), + IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), + IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), }; enum ib_signature_prot_cap { IB_PROT_T10DIF_TYPE_1 = 1, IB_PROT_T10DIF_TYPE_2 = 1 << 1, IB_PROT_T10DIF_TYPE_3 = 1 << 2, }; enum ib_signature_guard_cap { IB_GUARD_T10DIF_CRC = 1, IB_GUARD_T10DIF_CSUM = 1 << 1, }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; -enum ib_cq_create_flags { - IB_CQ_CREATE_CROSS_CHANNEL = 1 << 0, - IB_CQ_TIMESTAMP = 1 << 1, - IB_CQ_TIMESTAMP_TO_SYS_TIME = 1 << 2 +enum ib_odp_general_cap_bits { + IB_ODP_SUPPORT = 1 << 0, }; +enum ib_odp_transport_cap_bits { + IB_ODP_SUPPORT_SEND = 1 << 0, + IB_ODP_SUPPORT_RECV = 1 << 1, + IB_ODP_SUPPORT_WRITE = 1 << 2, + IB_ODP_SUPPORT_READ = 1 << 3, + IB_ODP_SUPPORT_ATOMIC = 1 << 4, +}; + +struct ib_odp_caps { + uint64_t general_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + } per_transport_caps; +}; + +struct ib_rss_caps { + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. 
+ * supported_qpts |= 1 << IB_QPT_UD + */ + u32 supported_qpts; + u32 max_rwq_indirection_tables; + u32 max_rwq_indirection_table_size; +}; + +enum ib_cq_creation_flags { + IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, + IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, +}; + +struct ib_cq_init_attr { + unsigned int cqe; + int comp_vector; + u32 flags; +}; + struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; u64 device_cap_flags; int max_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; - int max_rss_tbl_sz; u16 max_pkeys; u8 local_ca_ack_delay; - int comp_mask; - uint64_t timestamp_mask; - uint64_t hca_core_clock; - unsigned int sig_prot_cap; - unsigned int sig_guard_cap; + int sig_prot_cap; + int sig_guard_cap; + struct ib_odp_caps odp_caps; + uint64_t timestamp_mask; + uint64_t hca_core_clock; /* in KHZ */ + struct ib_rss_caps rss_caps; + u32 max_wq_type_rq; }; -enum ib_device_attr_comp_mask { - IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1, - IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2 -}; - enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN 
= 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5, - IB_PORT_DUMMY = -1 /* force enum signed */ + IB_PORT_DUMMY = -1, /* force enum signed */ }; enum ib_port_cap_flags { IB_PORT_SM = 1 << 1, IB_PORT_NOTICE_SUP = 1 << 2, IB_PORT_TRAP_SUP = 1 << 3, IB_PORT_OPT_IPD_SUP = 1 << 4, IB_PORT_AUTO_MIGR_SUP = 1 << 5, IB_PORT_SL_MAP_SUP = 1 << 6, IB_PORT_MKEY_NVRAM = 1 << 7, IB_PORT_PKEY_NVRAM = 1 << 8, IB_PORT_LED_INFO_SUP = 1 << 9, IB_PORT_SM_DISABLED = 1 << 10, IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IB_PORT_CM_SUP = 1 << 16, IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, IB_PORT_REINIT_SUP = 1 << 18, IB_PORT_DEVICE_MGMT_SUP = 1 << 19, IB_PORT_VENDOR_CLASS_SUP = 1 << 20, IB_PORT_DR_NOTICE_SUP = 1 << 21, IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, IB_PORT_BOOT_MGMT_SUP = 1 << 23, IB_PORT_LINK_LATENCY_SUP = 1 << 24, - IB_PORT_CLIENT_REG_SUP = 1 << 25 + IB_PORT_CLIENT_REG_SUP = 1 << 25, + IB_PORT_IP_BASED_GIDS = 1 << 26, }; enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32 }; -struct ib_protocol_stats { - /* TBD... */ +/** + * struct rdma_hw_stats + * @timestamp - Used by the core code to track when the last update was + * @lifespan - Used by the core code to determine how old the counters + * should be before being updated again. Stored in jiffies, defaults + * to 10 milliseconds, drivers can override the default by specifying + * their own value during their allocation routine. + * @names - Array of pointers to static names used for the counters in + * directory.
+ * @num_counters - How many hardware counters there are. If name is + * shorter than this number, a kernel oops will result. Driver authors + * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) + * in their code to prevent this. + * @value - Array of u64 counters that are accessed by the sysfs code and + * filled in by the drivers get_stats routine + */ +struct rdma_hw_stats { + unsigned long timestamp; + unsigned long lifespan; + const char * const *names; + int num_counters; + u64 value[]; }; -struct iw_protocol_stats { - u64 ipInReceives; - u64 ipInHdrErrors; - u64 ipInTooBigErrors; - u64 ipInNoRoutes; - u64 ipInAddrErrors; - u64 ipInUnknownProtos; - u64 ipInTruncatedPkts; - u64 ipInDiscards; - u64 ipInDelivers; - u64 ipOutForwDatagrams; - u64 ipOutRequests; - u64 ipOutDiscards; - u64 ipOutNoRoutes; - u64 ipReasmTimeout; - u64 ipReasmReqds; - u64 ipReasmOKs; - u64 ipReasmFails; - u64 ipFragOKs; - u64 ipFragFails; - u64 ipFragCreates; - u64 ipInMcastPkts; - u64 ipOutMcastPkts; - u64 ipInBcastPkts; - u64 ipOutBcastPkts; +#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 +/** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for drivers. 
+ * @names - Array of static const char * + * @num_counters - How many elements in array + * @lifespan - How many milliseconds between updates + */ +static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct( + const char * const *names, int num_counters, + unsigned long lifespan) +{ + struct rdma_hw_stats *stats; - u64 tcpRtoAlgorithm; - u64 tcpRtoMin; - u64 tcpRtoMax; - u64 tcpMaxConn; - u64 tcpActiveOpens; - u64 tcpPassiveOpens; - u64 tcpAttemptFails; - u64 tcpEstabResets; - u64 tcpCurrEstab; - u64 tcpInSegs; - u64 tcpOutSegs; - u64 tcpRetransSegs; - u64 tcpInErrs; - u64 tcpOutRsts; -}; + stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64), + GFP_KERNEL); + if (!stats) + return NULL; + stats->names = names; + stats->num_counters = num_counters; + stats->lifespan = msecs_to_jiffies(lifespan); -union rdma_protocol_stats { - struct ib_protocol_stats ib; - struct iw_protocol_stats iw; -}; + return stats; +} + +/* Define bits for the various functionality this port needs to be supported by + * the core. 
+ */ +/* Management 0x00000FFF */ +#define RDMA_CORE_CAP_IB_MAD 0x00000001 +#define RDMA_CORE_CAP_IB_SMI 0x00000002 +#define RDMA_CORE_CAP_IB_CM 0x00000004 +#define RDMA_CORE_CAP_IW_CM 0x00000008 +#define RDMA_CORE_CAP_IB_SA 0x00000010 +#define RDMA_CORE_CAP_OPA_MAD 0x00000020 + +/* Address format 0x000FF000 */ +#define RDMA_CORE_CAP_AF_IB 0x00001000 +#define RDMA_CORE_CAP_ETH_AH 0x00002000 + +/* Protocol 0xFFF00000 */ +#define RDMA_CORE_CAP_PROT_IB 0x00100000 +#define RDMA_CORE_CAP_PROT_ROCE 0x00200000 +#define RDMA_CORE_CAP_PROT_IWARP 0x00400000 +#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 + +#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_SMI \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_IB_SA \ + | RDMA_CORE_CAP_AF_IB) +#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ + (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ + | RDMA_CORE_CAP_IW_CM) +#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ + | RDMA_CORE_CAP_OPA_MAD) + struct ib_port_attr { + u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; int gid_tbl_len; u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u16 lid; u16 sm_lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u8 active_speed; u8 phys_state; + bool grh_required; }; enum ib_device_modify_flags { IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; +#define IB_DEVICE_NODE_DESC_MAX 64 + struct ib_device_modify { u64 sys_image_guid; - char node_desc[64]; + char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; enum 
ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3) }; struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, + IB_EVENT_WQ_FATAL, }; +const char *__attribute_const__ ib_event_msg(enum ib_event_type event); + struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; + struct ib_wq *wq; u8 port_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; +union rdma_network_hdr { + struct ib_grh ibgrh; + struct { + /* The IB spec states that if it's IPv4, the header + * is located in the last 20 bytes of the header. 
+ */ + u8 reserved[20]; + struct ip roce4grh; + }; +}; + enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) +#define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 }; enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18 }; -enum ib_mr_create_flags { - IB_MR_SIGNATURE_EN = 1, -}; - /** - * ib_mr_init_attr - Memory region init attributes passed to routine - * ib_create_mr. - * @max_reg_descriptors: max number of registration descriptors that - * may be used with registration work requests. - * @flags: MR creation flags bit mask. - */ -struct ib_mr_init_attr { - int max_reg_descriptors; - u32 flags; -}; - -/** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ -int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. 
*/ -int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__; +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); -struct ib_cq_init_attr { - int cqe; - int comp_vector; - u32 flags; -}; -enum ib_signature_type { - IB_SIG_TYPE_T10_DIF, +/** + * enum ib_mr_type - memory region type + * @IB_MR_TYPE_MEM_REG: memory region that is used for + * normal registration + * @IB_MR_TYPE_SIGNATURE: memory region that is used for + * signature operations (data-integrity + * capable regions) + * @IB_MR_TYPE_SG_GAPS: memory region that is capable to + * register any arbitrary sg lists (without + * the normal mr constraints - see + * ib_map_mr_sg) + */ +enum ib_mr_type { + IB_MR_TYPE_MEM_REG, + IB_MR_TYPE_SIGNATURE, + IB_MR_TYPE_SG_GAPS, }; /** - * T10-DIF Signature types - * T10-DIF types are defined by SCSI - * specifications. + * Signature types + * IB_SIG_TYPE_NONE: Unprotected. + * IB_SIG_TYPE_T10_DIF: Type T10-DIF */ -enum ib_t10_dif_type { - IB_T10DIF_NONE, - IB_T10DIF_TYPE1, - IB_T10DIF_TYPE2, - IB_T10DIF_TYPE3 +enum ib_signature_type { + IB_SIG_TYPE_NONE, + IB_SIG_TYPE_T10_DIF, }; /** * Signature T10-DIF block-guard types * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. * IB_T10DIF_CSUM: Corresponds to IP checksum rules. */ enum ib_t10_dif_bg_type { IB_T10DIF_CRC, IB_T10DIF_CSUM }; /** * struct ib_t10_dif_domain - Parameters specific for T10-DIF * domain. - * @type: T10-DIF type (0|1|2|3) * @bg_type: T10-DIF block guard type (CRC|CSUM) * @pi_interval: protection information interval. * @bg: seed of guard computation. * @app_tag: application tag of guard block * @ref_tag: initial guard block reference tag. - * @type3_inc_reftag: T10-DIF type 3 does not state - * about the reference tag, it is the user - * choice to increment it or not. 
+ * @ref_remap: Indicate whether the reftag increments each block + * @app_escape: Indicate to skip block check if apptag=0xffff + * @ref_escape: Indicate to skip block check if reftag=0xffffffff + * @apptag_check_mask: check bitmask of application tag. */ struct ib_t10_dif_domain { - enum ib_t10_dif_type type; enum ib_t10_dif_bg_type bg_type; - u32 pi_interval; + u16 pi_interval; u16 bg; u16 app_tag; u32 ref_tag; - bool type3_inc_reftag; + bool ref_remap; + bool app_escape; + bool ref_escape; + u16 apptag_check_mask; }; /** * struct ib_sig_domain - Parameters for signature domain * @sig_type: specific signauture type * @sig: union of all signature domain attributes that may * be used to set domain layout. */ struct ib_sig_domain { enum ib_signature_type sig_type; union { struct ib_t10_dif_domain dif; } sig; }; /** * struct ib_sig_attrs - Parameters for signature handover operation * @check_mask: bitmask for signature byte check (8 bytes) * @mem: memory domain layout desciptor. * @wire: wire domain layout desciptor. */ struct ib_sig_attrs { u8 check_mask; struct ib_sig_domain mem; struct ib_sig_domain wire; }; enum ib_sig_err_type { IB_SIG_BAD_GUARD, IB_SIG_BAD_REFTAG, IB_SIG_BAD_APPTAG, }; /** * struct ib_sig_err - signature error descriptor */ struct ib_sig_err { enum ib_sig_err_type err_type; u32 expected; u32 actual; u64 sig_err_offset; u32 key; }; enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; /** * struct ib_mr_status - Memory region status container * * @fail_status: Bitmask of MR checks status. For each * failed check a corresponding status bit is set. * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS * failure. */ struct ib_mr_status { u32 fail_status; struct ib_sig_err sig_err; }; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert.
*/ -enum ib_rate mult_to_ib_rate(int mult) __attribute_const__; +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct ib_ah_attr { struct ib_global_route grh; u16 dlid; u8 sl; u8 src_path_bits; u8 static_rate; u8 ah_flags; u8 port_num; - u8 dmac[6]; - u16 vlan_id; + u8 dmac[ETH_ALEN]; }; enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); + enum ib_wc_opcode { IB_WC_SEND, IB_WC_RDMA_WRITE, IB_WC_RDMA_READ, IB_WC_COMP_SWAP, IB_WC_FETCH_ADD, - IB_WC_BIND_MW, IB_WC_LSO, IB_WC_LOCAL_INV, - IB_WC_FAST_REG_MR, + IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). 
*/ IB_WC_RECV = 1 << 7, - IB_WC_RECV_RDMA_WITH_IMM + IB_WC_RECV_RDMA_WITH_IMM, + IB_WC_DUMMY = -1, /* force enum signed */ }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), - IB_WC_WITH_SL = (1<<4), - IB_WC_WITH_SLID = (1<<5), - IB_WC_WITH_TIMESTAMP = (1<<6), - IB_WC_WITH_SMAC = (1<<7), - IB_WC_WITH_VLAN = (1<<8), + IB_WC_WITH_SMAC = (1<<4), + IB_WC_WITH_VLAN = (1<<5), + IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; int wc_flags; u16 pkey_index; u16 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ - int csum_ok; - struct { - uint64_t timestamp; /* timestamp = 0 indicates error*/ - } ts; - u8 smac[6]; + u8 smac[ETH_ALEN]; u16 vlan_id; + u8 network_hdr_type; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC, IB_SRQT_XRC }; enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; } xrc; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; - u32 qpg_tss_mask_sz; + + /* + * Maximum number of rdma_rw_ctx structures in flight at a time. + * ib_create_qp() will calculate the right amount of needed WRs + * and MRs based on this.
+ */ + u32 max_rdma_ctxs; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC, IB_QPT_UC, IB_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = 8, IB_QPT_XRC_INI = 9, IB_QPT_XRC_TGT, - IB_QPT_DC_INI, IB_QPT_MAX, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer */ IB_QPT_RESERVED1 = 0x1000, IB_QPT_RESERVED2, IB_QPT_RESERVED3, IB_QPT_RESERVED4, IB_QPT_RESERVED5, IB_QPT_RESERVED6, IB_QPT_RESERVED7, IB_QPT_RESERVED8, IB_QPT_RESERVED9, IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, - IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, - IB_QP_CREATE_MANAGED_SEND = 1 << 3, - IB_QP_CREATE_MANAGED_RECV = 1 << 4, + IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, + IB_QP_CREATE_MANAGED_SEND = 1 << 3, + IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, + IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, + IB_QP_CREATE_SCATTER_FCS = 1 << 8, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; -enum ib_qpg_type { - IB_QPG_NONE = 0, - IB_QPG_PARENT = (1<<0), - IB_QPG_CHILD_RX = (1<<1), - IB_QPG_CHILD_TX = (1<<2) -}; +/* + * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler + * callback to destroy the passed in QP. 
+ */ -struct ib_qpg_init_attrib { - u32 tss_child_count; - u32 rss_child_count; -}; - struct ib_qp_init_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; - union { - struct ib_qp *qpg_parent; /* see qpg_type */ - struct ib_qpg_init_attrib parent_attrib; - }; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; - enum ib_qpg_type qpg_type; - u8 port_num; /* special QP types only */ -}; -enum { - IB_DCT_CREATE_FLAG_RCV_INLINE = 1 << 0, - IB_DCT_CREATE_FLAGS_MASK = IB_DCT_CREATE_FLAG_RCV_INLINE, + /* + * Only needed for special QP types, or when using the RW API. + */ + u8 port_num; + struct ib_rwq_ind_table *rwq_ind_tbl; }; -struct ib_dct_init_attr { - struct ib_pd *pd; - struct ib_cq *cq; - struct ib_srq *srq; - u64 dc_key; - u8 port; - u32 access_flags; - u8 min_rnr_timer; - u8 tclass; - u32 flow_label; - enum ib_mtu mtu; - u8 pkey_index; - u8 gid_index; - u8 hop_limit; - u32 create_flags; -}; - -struct ib_dct_attr { - u64 dc_key; - u8 port; - u32 access_flags; - u8 min_rnr_timer; - u8 tclass; - u32 flow_label; - enum ib_mtu mtu; - u8 pkey_index; - u8 gid_index; - u8 hop_limit; - u32 key_violations; - u8 state; -}; - struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, 
IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), - IB_QP_GROUP_RSS = (1<<21), - IB_QP_DC_KEY = (1<<22), - IB_QP_SMAC = (1<<23), - IB_QP_ALT_SMAC = (1<<24), - IB_QP_VID = (1<<25), - IB_QP_ALT_VID = (1<<26) + IB_QP_RESERVED1 = (1<<21), + IB_QP_RESERVED2 = (1<<22), + IB_QP_RESERVED3 = (1<<23), + IB_QP_RESERVED4 = (1<<24), }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR, - IB_QPS_DUMMY = -1 /* force enum signed */ + IB_QPS_DUMMY = -1, /* force enum signed */ }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; enum ib_mw_type { IB_MW_TYPE_1 = 1, IB_MW_TYPE_2 = 2 }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct ib_ah_attr ah_attr; struct ib_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u8 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; - u8 
smac[ETH_ALEN]; - u8 alt_smac[ETH_ALEN]; - u16 vlan_id; - u16 alt_vlan_id; - }; -struct ib_qp_attr_ex { - enum ib_qp_state qp_state; - enum ib_qp_state cur_qp_state; - enum ib_mtu path_mtu; - enum ib_mig_state path_mig_state; - u32 qkey; - u32 rq_psn; - u32 sq_psn; - u32 dest_qp_num; - int qp_access_flags; - struct ib_qp_cap cap; - struct ib_ah_attr ah_attr; - struct ib_ah_attr alt_ah_attr; - u16 pkey_index; - u16 alt_pkey_index; - u8 en_sqd_async_notify; - u8 sq_draining; - u8 max_rd_atomic; - u8 max_dest_rd_atomic; - u8 min_rnr_timer; - u8 port_num; - u8 timeout; - u8 retry_cnt; - u8 rnr_retry; - u8 alt_port_num; - u8 alt_timeout; - u64 dct_key; -}; - enum ib_wr_opcode { IB_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND, IB_WR_SEND_WITH_IMM, IB_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_LSO, IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, - IB_WR_FAST_REG_MR, + IB_WR_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, - IB_WR_BIND_MW, IB_WR_REG_SIG_MR, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. 
*/ IB_WR_RESERVED1 = 0xf0, IB_WR_RESERVED2, IB_WR_RESERVED3, IB_WR_RESERVED4, IB_WR_RESERVED5, IB_WR_RESERVED6, IB_WR_RESERVED7, IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, + IB_WR_DUMMY = -1, /* force enum signed */ }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), - IB_SEND_UMR_UNREG = (1<<5) }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; -struct ib_fast_reg_page_list { - struct ib_device *device; - u64 *page_list; - unsigned int max_page_list_len; +struct ib_cqe { + void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; -/** - * struct ib_mw_bind_info - Parameters for a memory window bind operation. - * @mr: A memory region to bind the memory window to. - * @addr: The address where the memory window should begin. - * @length: The length of the memory window, in bytes. - * @mw_access_flags: Access flags from enum ib_access_flags for the window. - * - * This struct contains the shared parameters for type 1 and type 2 - * memory window bind operations. 
- */ -struct ib_mw_bind_info { - struct ib_mr *mr; - u64 addr; - u64 length; - int mw_access_flags; -}; - struct ib_send_wr { struct ib_send_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; - union { - struct { - u64 remote_addr; - u32 rkey; - } rdma; - struct { - u64 remote_addr; - u64 compare_add; - u64 swap; - u64 compare_add_mask; - u64 swap_mask; - u32 rkey; - } atomic; - struct { - struct ib_ah *ah; - void *header; - int hlen; - int mss; - u32 remote_qpn; - u32 remote_qkey; - u16 pkey_index; /* valid for GSI only */ - u8 port_num; /* valid for DR SMPs on switch only */ - } ud; - struct { - u64 iova_start; - struct ib_fast_reg_page_list *page_list; - unsigned int page_shift; - unsigned int page_list_len; - u32 length; - int access_flags; - u32 rkey; - } fast_reg; - struct { - int npages; - int access_flags; - u32 mkey; - struct ib_pd *pd; - u64 virt_addr; - u64 length; - int page_shift; - } umr; - struct { - struct ib_mw *mw; - /* The new rkey for the memory window. 
*/ - u32 rkey; - struct ib_mw_bind_info bind_info; - } bind_mw; - struct { - struct ib_sig_attrs *sig_attrs; - struct ib_mr *sig_mr; - int access_flags; - struct ib_sge *prot; - } sig_handover; - } wr; - u32 xrc_remote_srq_num; /* XRC TGT QPs only */ }; +struct ib_rdma_wr { + struct ib_send_wr wr; + u64 remote_addr; + u32 rkey; +}; + +static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_rdma_wr, wr); +} + +struct ib_atomic_wr { + struct ib_send_wr wr; + u64 remote_addr; + u64 compare_add; + u64 swap; + u64 compare_add_mask; + u64 swap_mask; + u32 rkey; +}; + +static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_atomic_wr, wr); +} + +struct ib_ud_wr { + struct ib_send_wr wr; + struct ib_ah *ah; + void *header; + int hlen; + int mss; + u32 remote_qpn; + u32 remote_qkey; + u16 pkey_index; /* valid for GSI only */ + u8 port_num; /* valid for DR SMPs on switch only */ +}; + +static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_ud_wr, wr); +} + +struct ib_reg_wr { + struct ib_send_wr wr; + struct ib_mr *mr; + u32 key; + int access; +}; + +static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_reg_wr, wr); +} + +struct ib_sig_handover_wr { + struct ib_send_wr wr; + struct ib_sig_attrs *sig_attrs; + struct ib_mr *sig_mr; + int access_flags; + struct ib_sge *prot; +}; + +static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_sig_handover_wr, wr); +} + struct ib_recv_wr { struct ib_recv_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = 1, IB_ACCESS_REMOTE_WRITE = (1<<1), IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), - IB_ACCESS_ALLOCATE_MR = (1<<5), - 
IB_ZERO_BASED = (1<<13) + IB_ZERO_BASED = (1<<5), + IB_ACCESS_ON_DEMAND = (1<<6), }; -struct ib_phys_buf { - u64 addr; - u64 size; -}; - -struct ib_mr_attr { - struct ib_pd *pd; - u64 device_virt_addr; - u64 size; - int mr_access_flags; - u32 lkey; - u32 rkey; -}; - +/* + * XXX: these are apparently used for ->rereg_user_mr, no idea why they + * are hidden here instead of a uapi header! + */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), - IB_MR_REREG_ACCESS = (1<<2) + IB_MR_REREG_ACCESS = (1<<2), + IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; -/** - * struct ib_mw_bind - Parameters for a type 1 memory window bind operation. - * @wr_id: Work request id. - * @send_flags: Flags from ib_send_flags enum. - * @bind_info: More parameters of the bind operation. - */ -struct ib_mw_bind { - u64 wr_id; - int send_flags; - struct ib_mw_bind_info bind_info; -}; - struct ib_fmr_attr { int max_pages; int max_maps; u8 page_shift; }; +struct ib_umem; + struct ib_ucontext { struct ib_device *device; struct list_head pd_list; struct list_head mr_list; struct list_head mw_list; struct list_head cq_list; struct list_head qp_list; struct list_head srq_list; struct list_head ah_list; struct list_head xrcd_list; struct list_head rule_list; - struct list_head dct_list; + struct list_head wq_list; + struct list_head rwq_ind_tbl_list; int closing; - void *peer_mem_private_data; - char *peer_mem_name; + + pid_t tgid; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct rb_root umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + void (*invalidate_range)(struct ib_umem *umem, + unsigned long start, unsigned long end); + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. 
*/ + struct list_head no_private_counters; + int odp_mrs_count; +#endif }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ + struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; -struct ib_udata; -struct ib_udata_ops { - int (*copy_from)(void *dest, struct ib_udata *udata, - size_t len); - int (*copy_to)(struct ib_udata *udata, void *src, - size_t len); -}; - struct ib_udata { - struct ib_udata_ops *ops; - void __user *inbuf; + const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { + u32 local_dma_lkey; + u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ + + u32 unsafe_global_rkey; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct ib_mr *__internal_mr; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; - + struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; }; -enum ib_cq_attr_mask { - IB_CQ_MODERATION = (1 << 0), - IB_CQ_CAP_FLAGS = (1 << 1) -}; +typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); -enum ib_cq_cap_flags { - IB_CQ_IGNORE_OVERRUN = (1 << 0) +enum ib_poll_context { + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ }; -struct ib_cq_attr { - struct { - u16 cq_count; - u16 cq_period; - } moderation; - u32 cq_cap_flags; -}; - -typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); - struct ib_cq { struct ib_device *device; struct ib_uobject *uobject; 
ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; atomic_t usecnt; /* count number of work queues */ + enum ib_poll_context poll_ctx; + struct work_struct work; }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; u32 srq_num; } xrc; } ext; }; +enum ib_wq_type { + IB_WQT_RQ +}; + +enum ib_wq_state { + IB_WQS_RESET, + IB_WQS_RDY, + IB_WQS_ERR +}; + +struct ib_wq { + struct ib_device *device; + struct ib_uobject *uobject; + void *wq_context; + void (*event_handler)(struct ib_event *, void *); + struct ib_pd *pd; + struct ib_cq *cq; + u32 wq_num; + enum ib_wq_state state; + enum ib_wq_type wq_type; + atomic_t usecnt; +}; + +struct ib_wq_init_attr { + void *wq_context; + enum ib_wq_type wq_type; + u32 max_wr; + u32 max_sge; + struct ib_cq *cq; + void (*event_handler)(struct ib_event *, void *); +}; + +enum ib_wq_attr_mask { + IB_WQ_STATE = 1 << 0, + IB_WQ_CUR_STATE = 1 << 1, +}; + +struct ib_wq_attr { + enum ib_wq_state wq_state; + enum ib_wq_state curr_wq_state; +}; + +struct ib_rwq_ind_table { + struct ib_device *device; + struct ib_uobject *uobject; + atomic_t usecnt; + u32 ind_tbl_num; + u32 log_ind_tbl_size; + struct ib_wq **ind_tbl; +}; + +struct ib_rwq_ind_table_init_attr { + u32 log_ind_tbl_size; + /* Each entry is a pointer to Receive Work Queue */ + struct ib_wq **ind_tbl; +}; + +/* + * @max_write_sge: Maximum SGE elements per RDMA WRITE request. + * @max_read_sge: Maximum SGE elements per RDMA READ request. 
+ */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; + spinlock_t mr_lock; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; + /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; + u32 max_write_sge; + u32 max_read_sge; enum ib_qp_type qp_type; - enum ib_qpg_type qpg_type; - u8 port_num; + struct ib_rwq_ind_table *rwq_ind_tbl; }; -struct ib_dct { - struct ib_device *device; - struct ib_uobject *uobject; - struct ib_pd *pd; - struct ib_cq *cq; - struct ib_srq *srq; - u32 dct_num; -}; - struct ib_mr { struct ib_device *device; struct ib_pd *pd; - struct ib_uobject *uobject; u32 lkey; u32 rkey; - atomic_t usecnt; /* count number of MWs */ + u64 iova; + u32 length; + unsigned int page_size; + bool need_inval; + union { + struct ib_uobject *uobject; /* user */ + struct list_head qp_entry; /* FR */ + }; }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; enum ib_mw_type type; }; struct ib_fmr { struct ib_device *device; struct ib_pd *pd; struct list_head list; u32 lkey; u32 rkey; }; /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ IB_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IB_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IB_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IB_FLOW_ATTR_SNIFFER = 0x3 }; /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, - IB_FLOW_SPEC_IB = 0x21, + IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, + 
IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41 }; - +#define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 4 /* Flow steering rule priority is set according to it's domain. * Lower domain value means higher priority. */ enum ib_flow_domain { IB_FLOW_DOMAIN_USER, IB_FLOW_DOMAIN_ETHTOOL, IB_FLOW_DOMAIN_RFS, IB_FLOW_DOMAIN_NIC, IB_FLOW_DOMAIN_NUM /* Must be last */ }; enum ib_flow_flags { - IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1 + IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ + IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ }; struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_eth { enum ib_flow_spec_type type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; }; struct ib_flow_ib_filter { - __be32 l3_type_qpn; - u8 dst_gid[16]; + __be16 dlid; + __u8 sl; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_ib { enum ib_flow_spec_type type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; }; +/* IPv4 header flags */ +enum ib_ipv4_flags { + IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ + IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the + last have this flag set */ +}; + struct ib_flow_ipv4_filter { - __be32 src_ip; - __be32 dst_ip; + __be32 src_ip; + __be32 dst_ip; + u8 proto; + u8 tos; + u8 ttl; + u8 flags; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_ipv4 { enum ib_flow_spec_type type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; }; +struct ib_flow_ipv6_filter { + u8 src_ip[16]; + u8 dst_ip[16]; + __be32 flow_label; + u8 next_hdr; + u8 traffic_class; + u8 hop_limit; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_ipv6 { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ipv6_filter val; + struct 
ib_flow_ipv6_filter mask; +}; + struct ib_flow_tcp_udp_filter { - __be16 dst_port; + __be16 dst_port; __be16 src_port; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_tcp_udp { enum ib_flow_spec_type type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; }; union ib_flow_spec { struct { enum ib_flow_spec_type type; u16 size; }; - struct ib_flow_spec_ib ib; - struct ib_flow_spec_eth eth; - struct ib_flow_spec_ipv4 ipv4; - struct ib_flow_spec_tcp_udp tcp_udp; + struct ib_flow_spec_eth eth; + struct ib_flow_spec_ib ib; + struct ib_flow_spec_ipv4 ipv4; + struct ib_flow_spec_tcp_udp tcp_udp; + struct ib_flow_spec_ipv6 ipv6; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; + u32 flags; u8 num_of_specs; u8 port; - u32 flags; /* Following are the optional layers according to user request * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ }; struct ib_flow { struct ib_qp *qp; struct ib_uobject *uobject; }; -struct ib_mad; +struct ib_mad_hdr; struct ib_grh; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; #define IB_DEVICE_NAME_MAX 64 struct ib_cache { rwlock_t lock; struct ib_event_handler event_handler; struct ib_pkey_cache **pkey_cache; - struct ib_gid_cache **gid_cache; + struct ib_gid_table **gid_cache; u8 *lmc_cache; }; -enum verbs_values_mask { - IBV_VALUES_HW_CLOCK = 1 << 0 -}; - -struct ib_device_values { - int values_mask; - uint64_t hwclock; -}; - struct ib_dma_mapping_ops { int (*mapping_error)(struct ib_device *dev, u64 dma_addr); u64 (*map_single)(struct ib_device *dev, void *ptr, size_t size, enum 
dma_data_direction direction); void (*unmap_single)(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction); u64 (*map_page)(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction); void (*unmap_page)(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction); int (*map_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); void (*unmap_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); - u64 (*dma_address)(struct ib_device *dev, - struct scatterlist *sg); - unsigned int (*dma_len)(struct ib_device *dev, - struct scatterlist *sg); + int (*map_sg_attrs)(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*unmap_sg_attrs)(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, + struct dma_attrs *attrs); void (*sync_single_for_cpu)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void *(*alloc_coherent)(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag); void (*free_coherent)(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle); }; struct iw_cm_verbs; -struct ib_exp_device_attr; -struct ib_exp_qp_init_attr; +struct ib_port_immutable { + int pkey_tbl_len; + int gid_tbl_len; + u32 core_cap_flags; + u32 max_mad_size; +}; + struct ib_device { struct device *dma_device; char name[IB_DEVICE_NAME_MAX]; struct list_head event_handler_list; spinlock_t event_handler_lock; spinlock_t client_data_lock; struct list_head core_list; + /* Access to the client_data_list is protected by the client_data_lock + * spinlock and the lists_rwsem read-write semaphore */ struct list_head client_data_list; struct 
ib_cache cache; - int *pkey_tbl_len; - int *gid_tbl_len; + /** + * port_immutable is indexed by port number + */ + struct ib_port_immutable *port_immutable; int num_comp_vectors; struct iw_cm_verbs *iwcm; - int (*get_protocol_stats)(struct ib_device *device, - union rdma_protocol_stats *stats); + /** + * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the + * driver initialized data. The struct is kfree()'ed by the sysfs + * core when the device is removed. A lifespan of -1 in the return + * struct tells the core to set a default lifespan. + */ + struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device, + u8 port_num); + /** + * get_hw_stats - Fill in the counter value(s) in the stats struct. + * @index - The index in the value array we wish to have updated, or + * num_counters if we want all stats updated + * Return codes - + * < 0 - Error, no counters updated + * index - Updated the single counter pointed to by index + * num_counters - Updated all counters (will reset the timestamp + * and prevent further calls for lifespan milliseconds) + * Drivers are allowed to update all counters in leiu of just the + * one given in index at their option + */ + int (*get_hw_stats)(struct ib_device *device, + struct rdma_hw_stats *stats, + u8 port, int index); int (*query_device)(struct ib_device *device, - struct ib_device_attr *device_attr); + struct ib_device_attr *device_attr, + struct ib_udata *udata); int (*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u8 port_num); /* When calling get_netdev, the HW vendor's driver should return the - * net device of device @device at port @port_num. The function - * is called in rtnl_lock. The HW vendor's device driver must guarantee - * to return NULL before the net device has reached + * net device of device @device at port @port_num or NULL if such + * a net device doesn't exist. 
The vendor driver should call dev_hold + * on this net device. The HW vendor's device driver must guarantee + * that this function returns NULL before the net device reaches * NETDEV_UNREGISTER_FINAL state. */ - struct net_device *(*get_netdev)(struct ib_device *device, - u8 port_num); + struct net_device *(*get_netdev)(struct ib_device *device, + u8 port_num); int (*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); + /* When calling add_gid, the HW vendor's driver should + * add the gid of device @device at gid index @index of + * port @port_num to be @gid. Meta-info of that gid (for example, + * the network device related to this gid is available + * at @attr. @context allows the HW vendor driver to store extra + * information together with a GID entry. The HW vendor may allocate + * memory to contain this information and store it in @context when a + * new GID entry is written to. Params are consistent until the next + * call of add_gid or delete_gid. The function should return 0 on + * success or error otherwise. The function could be called + * concurrently for different ports. This function is only called + * when roce_gid_table is used. + */ + int (*add_gid)(struct ib_device *device, + u8 port_num, + unsigned int index, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + void **context); + /* When calling del_gid, the HW vendor's driver should delete the + * gid of device @device at gid index @index of port @port_num. + * Upon the deletion of a GID entry, the HW vendor must free any + * allocated memory. The caller will clear @context afterwards. + * This function is only called when roce_gid_table is used. 
+ */ + int (*del_gid)(struct ib_device *device, + u8 port_num, + unsigned int index, + void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int (*modify_port)(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify); struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device, struct ib_udata *udata); int (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); struct ib_pd * (*alloc_pd)(struct ib_device *device, struct ib_ucontext *context, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd); struct ib_ah * (*create_ah)(struct ib_pd *pd, struct ib_ah_attr *ah_attr); int (*modify_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah); struct ib_srq * (*create_srq)(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq); int (*post_srq_recv)(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr); struct ib_qp * (*create_qp)(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp); int (*post_send)(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, struct ib_recv_wr *recv_wr, 
struct ib_recv_wr **bad_recv_wr); struct ib_cq * (*create_cq)(struct ib_device *device, - struct ib_cq_init_attr *attr, + const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); - int (*modify_cq)(struct ib_cq *cq, - struct ib_cq_attr *cq_attr, - int cq_attr_mask); + int (*modify_cq)(struct ib_cq *cq, u16 cq_count, + u16 cq_period); int (*destroy_cq)(struct ib_cq *cq); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*req_ncomp_notif)(struct ib_cq *cq, int wc_cnt); struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); - struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, - struct ib_udata *udata, - int mr_id); - int (*query_mr)(struct ib_mr *mr, - struct ib_mr_attr *mr_attr); - int (*dereg_mr)(struct ib_mr *mr); - int (*destroy_mr)(struct ib_mr *mr); - struct ib_mr * (*create_mr)(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr); - struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd, - int max_page_list_len); - struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, - int page_list_len); - void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list); - int (*rereg_phys_mr)(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, + struct ib_udata *udata); + int (*rereg_user_mr)(struct ib_mr *mr, + int flags, + u64 start, u64 length, + u64 virt_addr, int mr_access_flags, - u64 *iova_start); + struct ib_pd *pd, + struct ib_udata *udata); + int (*dereg_mr)(struct ib_mr *mr); + struct ib_mr * 
(*alloc_mr)(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); + int (*map_mr_sg)(struct ib_mr *mr, + struct scatterlist *sg, + int sg_nents, + unsigned int *sg_offset); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, - enum ib_mw_type type); - int (*bind_mw)(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind); + enum ib_mw_type type, + struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int (*map_phys_fmr)(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova); int (*unmap_fmr)(struct list_head *fmr_list); int (*dealloc_fmr)(struct ib_fmr *fmr); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*process_mad)(struct ib_device *device, int process_mad_flags, u8 port_num, - struct ib_wc *in_wc, - struct ib_grh *in_grh, - struct ib_mad *in_mad, - struct ib_mad *out_mad); + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, + struct ib_mad_hdr *out_mad, + size_t *out_mad_size, + u16 *out_mad_pkey_index); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, struct ib_ucontext *ucontext, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd); struct ib_flow * (*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int (*destroy_flow)(struct ib_flow *flow_id); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); - - unsigned long (*get_unmapped_area)(struct file *file, - unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); - int (*ioctl)(struct ib_ucontext *context, - unsigned int cmd, - unsigned long arg); - int (*query_values)(struct ib_device *device, - int q_values, - struct ib_device_values *values); + void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); + void 
(*drain_rq)(struct ib_qp *qp); + void (*drain_sq)(struct ib_qp *qp); + int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port, + int state); + int (*get_vf_config)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *ivf); + int (*get_vf_stats)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); + int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, + int type); + struct ib_wq * (*create_wq)(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_wq)(struct ib_wq *wq); + int (*modify_wq)(struct ib_wq *wq, + struct ib_wq_attr *attr, + u32 wq_attr_mask, + struct ib_udata *udata); + struct ib_rwq_ind_table * (*create_rwq_ind_table)(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dma_mapping_ops *dma_ops; struct module *owner; struct device dev; struct kobject *ports_parent; struct list_head port_list; enum { IB_DEV_UNINITIALIZED, IB_DEV_REGISTERED, IB_DEV_UNREGISTERED } reg_state; int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; - char node_desc[64]; + char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; + u16 is_switch:1; u8 node_type; u8 phys_port_cnt; - int cmd_perf; - u64 cmd_avg; - u32 cmd_n; - spinlock_t cmd_perf_lock; + struct ib_device_attr attrs; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; - /* - * Experimental data and functions + /** + * The following mandatory functions are used only at device + * registration. Keep functions such as these at the end of this + * structure to avoid cache line misses when accessing struct ib_device + * in fast paths. 
*/ - int (*exp_query_device)(struct ib_device *device, - struct ib_exp_device_attr *device_attr); - struct ib_qp * (*exp_create_qp)(struct ib_pd *pd, - struct ib_exp_qp_init_attr *qp_init_attr, - struct ib_udata *udata); - struct ib_dct * (*exp_create_dct)(struct ib_pd *pd, - struct ib_dct_init_attr *attr, - struct ib_udata *udata); - int (*exp_destroy_dct)(struct ib_dct *dct); - int (*exp_query_dct)(struct ib_dct *dct, struct ib_dct_attr *attr); - - u64 uverbs_exp_cmd_mask; + int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); + void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); }; struct ib_client { char *name; void (*add) (struct ib_device *); - void (*remove)(struct ib_device *); + void (*remove)(struct ib_device *, void *client_data); + /* Returns the net_dev belonging to this ib_client and matching the + * given parameters. + * @dev: An RDMA device that the net_dev use for communication. + * @port: A physical port number on the RDMA device. + * @pkey: P_Key that the net_dev uses if applicable. + * @gid: A GID that the net_dev uses to communicate. + * @addr: An IP address the net_dev is configured with. + * @client_data: The device's client data set by ib_set_client_data(). + * + * An ib_client that implements a net_dev on top of RDMA devices + * (such as IP over IB) should implement this callback, allowing the + * rdma_cm module to find the right net_dev for a given request. + * + * The caller is responsible for calling dev_put on the returned + * netdev. 
*/ + struct net_device *(*get_net_dev_by_params)( + struct ib_device *dev, + u8 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr, + void *client_data); struct list_head list; }; struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); +void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); + int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { - return udata->ops->copy_from(dest, udata, len); + return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { - return udata->ops->copy_to(udata, src, len); + return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + const void __user *p = (const char __user *)udata->inbuf + offset; + bool ret; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = memdup_user(p, len); + if (IS_ERR(buf)) + return false; + + ret = !memchr_inv(buf, 0, len); + kfree(buf); + return ret; +} + /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. 
* @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask, enum rdma_link_layer ll); int ib_register_event_handler (struct ib_event_handler *event_handler); int ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr); - int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num); +/** + * rdma_cap_ib_switch - Check if the device is IB switch + * @device: Device to check + * + * Device driver is responsible for setting is_switch bit on + * in ib_device structure at init time. + * + * Return: true if the device is IB switch. + */ +static inline bool rdma_cap_ib_switch(const struct ib_device *device) +{ + return device->is_switch; +} + +/** + * rdma_start_port - Return the first valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return start port number + */ +static inline u8 rdma_start_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 
0 : 1; +} + +/** + * rdma_end_port - Return the last valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return last port number + */ +static inline u8 rdma_end_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; +} + +static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; +} + +static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); +} + +static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; +} + +static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; +} + +static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; +} + +static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) +{ + return rdma_protocol_ib(device, port_num) || + rdma_protocol_roce(device, port_num); +} + +/** + * rdma_cap_ib_mad - Check if the port of a device supports Infiniband + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Management Datagrams (MAD) are a required part of the InfiniBand + * specification and are supported on all InfiniBand devices. A slightly + * extended version are also supported on OPA interfaces. + * + * Return: true if the port supports sending/receiving of MAD packets. 
+ */ +static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; +} + +/** + * rdma_cap_opa_mad - Check if the port of device provides support for OPA + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Intel OmniPath devices extend and/or replace the InfiniBand Management + * datagrams with their own versions. These OPA MADs share many but not all of + * the characteristics of InfiniBand MADs. + * + * OPA MADs differ in the following ways: + * + * 1) MADs are variable size up to 2K + * IBTA defined MADs remain fixed at 256 bytes + * 2) OPA SMPs must carry valid PKeys + * 3) OPA SMP packets are a different format + * + * Return: true if the port supports OPA MAD packet formats. + */ +static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) +{ + return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) + == RDMA_CORE_CAP_OPA_MAD; +} + +/** + * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband + * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). + * @device: Device to check + * @port_num: Port number to check + * + * Each InfiniBand node is required to provide a Subnet Management Agent + * that the subnet manager can access. Prior to the fabric being fully + * configured by the subnet manager, the SMA is accessed via a well known + * interface called the Subnet Management Interface (SMI). This interface + * uses directed route packets to communicate with the SM to get around the + * chicken and egg problem of the SM needing to know what's on the fabric + * in order to configure the fabric, and needing to configure the fabric in + * order to send packets to the devices on the fabric. These directed + * route packets do not need the fabric fully configured in order to reach + * their destination. 
The SMI is the only method allowed to send + * directed route packets on an InfiniBand fabric. + * + * Return: true if the port provides an SMI. + */ +static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; +} + +/** + * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * The InfiniBand Communication Manager is one of many pre-defined General + * Service Agents (GSA) that are accessed via the General Service + * Interface (GSI). It's role is to facilitate establishment of connections + * between nodes as well as other management related tasks for established + * connections. + * + * Return: true if the port supports an IB CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; +} + +/** + * rdma_cap_iw_cm - Check if the port of device has the capability IWARP + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * Similar to above, but specific to iWARP connections which have a different + * managment protocol than InfiniBand. + * + * Return: true if the port supports an iWARP CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; +} + +/** + * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband + * Subnet Administration. 
+ * @device: Device to check + * @port_num: Port number to check + * + * An InfiniBand Subnet Administration (SA) service is a pre-defined General + * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand + * fabrics, devices should resolve routes to other hosts by contacting the + * SA to query the proper route. + * + * Return: true if the port should act as a client to the fabric Subnet + * Administration interface. This does not imply that the SA service is + * running locally. + */ +static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; +} + +/** + * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband + * Multicast. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand multicast registration is more complex than normal IPv4 or + * IPv6 multicast registration. Each Host Channel Adapter must register + * with the Subnet Manager when it wishes to join a multicast group. It + * should do so only once regardless of how many queue pairs it subscribes + * to this group. And it should leave the group only after all queue pairs + * attached to the group have been detached. + * + * Return: true if the port must undertake the additional adminstrative + * overhead of registering/unregistering with the SM and tracking of the + * total number of queue pairs attached to the multicast group. + */ +static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num) +{ + return rdma_cap_ib_sa(device, port_num); +} + +/** + * rdma_cap_af_ib - Check if the port of device has the capability + * Native Infiniband Address. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default + * GID. RoCE uses a different mechanism, but still generates a GID via + * a prescribed mechanism and port specific data. 
+ * + * Return: true if the port uses a GID address to identify devices on the + * network. + */ +static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; +} + +/** + * rdma_cap_eth_ah - Check if the port of device has the capability + * Ethernet Address Handle. + * @device: Device to check + * @port_num: Port number to check + * + * RoCE is InfiniBand over Ethernet, and it uses a well defined technique + * to fabricate GIDs over Ethernet/IP specific addresses native to the + * port. Normally, packet headers are generated by the sending host + * adapter, but when sending connectionless datagrams, we must manually + * inject the proper headers for the fabric we are communicating over. + * + * Return: true if we are running as a RoCE port and must force the + * addition of a Global Route Header built from our Ethernet Address + * Handle into our header list for connectionless packets. + */ +static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; +} + +/** + * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. + * + * @device: Device + * @port_num: Port number + * + * This MAD size includes the MAD headers and MAD payload. No other headers + * are included. + * + * Return the max MAD size required by the Port. Will return 0 if the port + * does not support MADs + */ +static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].max_mad_size; +} + +/** + * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table + * @device: Device to check + * @port_num: Port number to check + * + * RoCE GID table mechanism manages the various GIDs for a device. 
+ * + * NOTE: if allocating the port's GID table has failed, this call will still + * return true, but any RoCE GID table API will fail. + * + * Return: true if the port uses RoCE GID table mechanism in order to manage + * its GIDs. + */ +static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, + u8 port_num) +{ + return rdma_protocol_roce(device, port_num) && + device->add_gid && device->del_gid; +} + +/* + * Check if the device supports READ W/ INVALIDATE. + */ +static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) +{ + /* + * iWarp drivers must support READ W/ INVALIDATE. No other protocol + * has support for it yet. + */ + return rdma_protocol_iwarp(dev, port_num); +} + int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid); + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr); +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state); +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info); +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type); + int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); -/** - * ib_alloc_pd - Allocates an unused protection domain. - * @device: The device on which to allocate the protection domain. 
- * - * A protection domain object provides an association between QPs, shared - * receive queues, address handles, memory regions, and memory windows. - */ -struct ib_pd *ib_alloc_pd(struct ib_device *device); +enum ib_pd_flags { + /* + * Create a memory registration for all memory in the system and place + * the rkey for it into pd->unsafe_global_rkey. This can be used by + * ULPs to avoid the overhead of dynamic MRs. + * + * This flag is generally considered unsafe and must only be used in + * extremly trusted environments. Every use of it will log a warning + * in the kernel log. + */ + IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, +}; -/** - * ib_dealloc_pd - Deallocates a protection domain. - * @pd: The protection domain to deallocate. - */ -int ib_dealloc_pd(struct ib_pd *pd); +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller); +#define ib_alloc_pd(device, flags) \ + __ib_alloc_pd((device), (flags), __func__) +void ib_dealloc_pd(struct ib_pd *pd); /** * ib_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); /** * ib_init_ah_from_wc - Initializes address handle attributes from a * work completion. * @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. 
*/ -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, - struct ib_grh *grh, struct ib_ah_attr *ah_attr); +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct ib_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ -struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, - struct ib_grh *grh, u8 port_num); +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num); /** * ib_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); /** * ib_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); /** * ib_destroy_ah - Destroys an address handle. * @ah: The address handle to destroy. */ int ib_destroy_ah(struct ib_ah *ah); /** * ib_create_srq - Creates a SRQ associated with the specified protection * domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the * SRQ. 
If SRQ creation succeeds, then the attributes are updated to * the actual capabilities of the created SRQ. * * srq_attr->max_wr and srq_attr->max_sge are read the determine the * requested size of the SRQ, and set to the actual values allocated * on return. If ib_create_srq() succeeds, then max_wr and max_sge * will always be at least as large as the requested values. */ struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr); /** * ib_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified. * * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or * IB_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask); /** * ib_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. */ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); /** * ib_destroy_srq - Destroys the specified SRQ. * @srq: The SRQ to destroy. */ int ib_destroy_srq(struct ib_srq *srq); /** * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. 
*/ static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr); } /** * ib_create_qp - Creates a QP associated with the specified protection * domain. * @pd: The protection domain associated with the QP. * @qp_init_attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. */ struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr); /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. * @qp: The QP to modify. * @qp_attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @qp_attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. */ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask); /** * ib_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @qp_attr: The attributes of the specified QP. * @qp_attr_mask: A bit-mask used to select specific attributes to query. * @qp_init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. */ int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); /** * ib_destroy_qp - Destroys the specified QP. * @qp: The QP to destroy. */ int ib_destroy_qp(struct ib_qp *qp); /** * ib_open_qp - Obtain a reference to an existing sharable QP. * @xrcd - XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. 
*/ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr); /** * ib_close_qp - Release an external reference to a QP. * @qp: The QP handle to release * * The opened QP handle is released by the caller. The underlying * shared QP is not destroyed until all internal references are released. */ int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. * @qp: The QP to post the work request on. * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. * * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate * error is returned, the QP state shall not be affected, * ib_post_send() will return an immediate error after queueing any * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr) { return qp->device->post_send(qp, send_wr, bad_send_wr); } /** * ib_post_recv - Posts a list of work requests to the receive queue of * the specified QP. * @qp: The QP to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx); +void ib_free_cq(struct ib_cq *cq); + /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. * @comp_handler: A user-specified callback that is invoked when a * completion event occurs on the CQ. 
* @event_handler: A user-specified callback that is invoked when an * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. - * @cqe: The minimum size of the CQ. - * @comp_vector - Completion vector used to signal completion events. - * Must be >= 0 and < context->num_comp_vectors. + * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), - void *cq_context, int cqe, int comp_vector); + void *cq_context, + const struct ib_cq_init_attr *cq_attr); /** * ib_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. */ int ib_resize_cq(struct ib_cq *cq, int cqe); /** - * ib_modify_cq - Modifies the attributes for the specified CQ and then - * transitions the CQ to the given state. + * ib_modify_cq - Modifies moderation params of the CQ * @cq: The CQ to modify. - * @cq_attr: specifies the CQ attributes to modify. - * @cq_attr_mask: A bit-mask used to specify which attributes of the CQ - * are being modified. + * @cq_count: number of CQEs that will trigger an event + * @cq_period: max period of time in usec before triggering an event + * */ -int ib_modify_cq(struct ib_cq *cq, - struct ib_cq_attr *cq_attr, - int cq_attr_mask); +int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq - Destroys the specified CQ. * @cq: The CQ to destroy. 
*/ int ib_destroy_cq(struct ib_cq *cq); /** * ib_poll_cq - poll a CQ for completion(s) * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries &struct ib_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and < num_entries, then the CQ was emptied. */ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { return cq->device->poll_cq(cq, num_entries, wc); } /** * ib_peek_cq - Returns the number of unreaped completions currently * on the specified CQ. * @cq: The CQ to peek. * @wc_cnt: A minimum number of unreaped completions to check for. * * If the number of unreaped completions is greater than or equal to wc_cnt, * this function returns wc_cnt, otherwise, it returns the actual number of * unreaped completions. */ int ib_peek_cq(struct ib_cq *cq, int wc_cnt); /** * ib_req_notify_cq - Request completion notification on a CQ. * @cq: The CQ to generate an event for. * @flags: * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP * to request an event on the next solicited event or next work * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS * may also be |ed in to request a hint about missed events, as * described below. * * Return Value: * < 0 means an error occurred while requesting notification * == 0 means notification was requested successfully, and if * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events * were missed and it is safe to wait for another event. In * this case is it guaranteed that any work completions added * to the CQ since the last CQ poll will trigger a completion * notification event. * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed * in. 
It means that the consumer must poll the CQ again to * make sure it is empty to avoid missing an event because of a * race between requesting notification and an entry being * added to the CQ. This return value means it is possible * (but not guaranteed) that a work completion has been added * to the CQ since the last poll without triggering a * completion notification event. */ static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { return cq->device->req_notify_cq(cq, flags); } /** * ib_req_ncomp_notif - Request completion notification when there are * at least the specified number of unreaped completions on the CQ. * @cq: The CQ to generate an event for. * @wc_cnt: The number of unreaped completions that should be on the * CQ before an event is generated. */ static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt) { return cq->device->req_ncomp_notif ? cq->device->req_ncomp_notif(cq, wc_cnt) : -ENOSYS; } /** - * ib_get_dma_mr - Returns a memory region for system memory that is - * usable for DMA. - * @pd: The protection domain associated with the memory region. - * @mr_access_flags: Specifies the memory access rights. - * - * Note that the ib_dma_*() functions defined below must be used - * to create/destroy addresses used with the Lkey or Rkey returned - * by ib_get_dma_mr(). 
- */ -struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags); - -/** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check */ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) { if (dev->dma_ops) return dev->dma_ops->mapping_error(dev, dma_addr); return dma_mapping_error(dev->dma_device, dma_addr); } /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created * @cpu_addr: The kernel virtual address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_single(dev, cpu_addr, size, direction); return dma_map_single(dev->dma_device, cpu_addr, size, direction); } /** * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_single(dev, addr, size, direction); else dma_unmap_single(dev->dma_device, addr, size, direction); } static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { return dma_map_single_attrs(dev->dma_device, cpu_addr, size, - direction, attrs); + direction, dma_attrs); } static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { return dma_unmap_single_attrs(dev->dma_device, addr, 
size, - direction, attrs); + direction, dma_attrs); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_page(dev, page, offset, size, direction); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_page(dev, addr, size, direction); else dma_unmap_page(dev->dma_device, addr, size, direction); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_sg(dev, sg, nents, direction); return dma_map_sg(dev->dma_device, sg, nents, direction); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, 
enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_sg(dev, sg, nents, direction); else dma_unmap_sg(dev->dma_device, sg, nents, direction); } static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { - return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs); + if (dev->dma_ops) + return dev->dma_ops->map_sg_attrs(dev, sg, nents, direction, + dma_attrs); + else + return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, + dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { - dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs); + if (dev->dma_ops) + return dev->dma_ops->unmap_sg_attrs(dev, sg, nents, direction, + dma_attrs); + else + dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, + dma_attrs); } /** * ib_sg_dma_address - Return the DMA address from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_address() into sg_dma_address(). */ static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) { - if (dev->dma_ops) - return dev->dma_ops->dma_address(dev, sg); return sg_dma_address(sg); } /** * ib_sg_dma_len - Return the DMA length from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_len() into sg_dma_len(). 
*/ static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg) { - if (dev->dma_ops) - return dev->dma_ops->dma_len(dev, sg); return sg_dma_len(sg); } /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (dev->dma_ops) dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir); else dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); } /** * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (dev->dma_ops) dev->dma_ops->sync_single_for_device(dev, addr, size, dir); else dma_sync_single_for_device(dev->dma_device, addr, size, dir); } /** * ib_dma_alloc_coherent - Allocate memory and map it for DMA * @dev: The device for which the DMA address is requested * @size: The size of the region to allocate in bytes * @dma_handle: A pointer for returning the DMA address of the region * @flag: memory allocator flags */ static inline void *ib_dma_alloc_coherent(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag) { if (dev->dma_ops) return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag); else { dma_addr_t handle; void *ret; ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag); *dma_handle = handle; return ret; } } /** * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent() * @dev: The device for which the DMA addresses were allocated * @size: The size of the region * @cpu_addr: the 
address returned by ib_dma_alloc_coherent() * @dma_handle: the DMA address returned by ib_dma_alloc_coherent() */ static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle) { if (dev->dma_ops) dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); else dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); } /** - * ib_reg_phys_mr - Prepares a virtually addressed memory region for use - * by an HCA. - * @pd: The protection domain associated assigned to the registered region. - * @phys_buf_array: Specifies a list of physical buffers to use in the - * memory region. - * @num_phys_buf: Specifies the size of the phys_buf_array. - * @mr_access_flags: Specifies the memory access rights. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** - * ib_rereg_phys_mr - Modifies the attributes of an existing memory region. - * Conceptually, this call performs the functions deregister memory region - * followed by register physical memory region. Where possible, - * resources are reused instead of deallocated and reallocated. - * @mr: The memory region to modify. - * @mr_rereg_mask: A bit-mask used to indicate which of the following - * properties of the memory region are being modified. - * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies - * the new protection domain to associated with the memory region, - * otherwise, this parameter is ignored. - * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies a list of physical buffers to use in the new - * translation, otherwise, this parameter is ignored. - * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies the size of the phys_buf_array, otherwise, this - * parameter is ignored. 
- * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this - * field specifies the new memory access rights, otherwise, this - * parameter is ignored. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** - * ib_query_mr - Retrieves information about a specific memory region. - * @mr: The memory region to retrieve information about. - * @mr_attr: The attributes of the specified memory region. - */ -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); - -/** * ib_dereg_mr - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * * This function can fail, if the memory region has memory windows bound to it. */ int ib_dereg_mr(struct ib_mr *mr); +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); /** - * ib_create_mr - Allocates a memory region that may be used for - * signature handover operations. - * @pd: The protection domain associated with the region. - * @mr_init_attr: memory region init attributes. - */ -struct ib_mr *ib_create_mr(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr); - -/** - * ib_destroy_mr - Destroys a memory region that was created using - * ib_create_mr and removes it from HW translation tables. - * @mr: The memory region to destroy. - * - * This function can fail, if the memory region has memory windows bound to it. - */ -int ib_destroy_mr(struct ib_mr *mr); - -/** - * ib_alloc_fast_reg_mr - Allocates memory region usable with the - * IB_WR_FAST_REG_MR send work request. - * @pd: The protection domain associated with the region. - * @max_page_list_len: requested max physical buffer list length to be - * used with fast register work requests for this MR. 
- */ -struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); - -/** - * ib_alloc_fast_reg_page_list - Allocates a page list array - * @device - ib device pointer. - * @page_list_len - size of the page list array to be allocated. - * - * This allocates and returns a struct ib_fast_reg_page_list * and a - * page_list array that is at least page_list_len in size. The actual - * size is returned in max_page_list_len. The caller is responsible - * for initializing the contents of the page_list array before posting - * a send work request with the IB_WC_FAST_REG_MR opcode. - * - * The page_list array entries must be translated using one of the - * ib_dma_*() functions just like the addresses passed to - * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct - * ib_fast_reg_page_list must not be modified by the caller until the - * IB_WC_FAST_REG_MR work request completes. - */ -struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list( - struct ib_device *device, int page_list_len); - -/** - * ib_free_fast_reg_page_list - Deallocates a previously allocated - * page list array. - * @page_list - struct ib_fast_reg_page_list pointer to be deallocated. - */ -void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); - -/** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. * @mr - struct ib_mr pointer to be updated. * @newkey - new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { mr->lkey = (mr->lkey & 0xffffff00) | newkey; mr->rkey = (mr->rkey & 0xffffff00) | newkey; } /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. * @rkey - the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return ((rkey + 1) & mask) | (rkey & ~mask); } /** - * ib_alloc_mw - Allocates a memory window. 
- * @pd: The protection domain associated with the memory window. - * @type: The type of the memory window (1 or 2). - */ -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); - -/** - * ib_bind_mw - Posts a work request to the send queue of the specified - * QP, which binds the memory window to the given address range and - * remote access attributes. - * @qp: QP to post the bind work request on. - * @mw: The memory window to bind. - * @mw_bind: Specifies information about the memory window, including - * its address range, remote access rights, and associated memory region. - * - * If there is no immediate error, the function will update the rkey member - * of the mw parameter to its new value. The bind operation can still fail - * asynchronously. - */ -static inline int ib_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - /* XXX reference counting in corresponding MR? */ - return mw->device->bind_mw ? - mw->device->bind_mw(qp, mw, mw_bind) : - -ENOSYS; -} - -/** - * ib_dealloc_mw - Deallocates a memory window. - * @mw: The memory window to deallocate. - */ -int ib_dealloc_mw(struct ib_mw *mw); - -/** * ib_alloc_fmr - Allocates a unmapped fast memory region. * @pd: The protection domain associated with the unmapped region. * @mr_access_flags: Specifies the memory access rights. * @fmr_attr: Attributes of the unmapped region. * * A fast memory region must be mapped before it can be used as part of * a work request. */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); /** * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. * @fmr: The fast memory region to associate with the pages. * @page_list: An array of physical pages to map to the fast memory region. * @list_len: The number of pages in page_list. * @iova: The I/O virtual address to use with the mapped region. 
*/ static inline int ib_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova) { return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova); } /** * ib_unmap_fmr - Removes the mapping from a list of fast memory regions. * @fmr_list: A linked list of fast memory regions to unmap. */ int ib_unmap_fmr(struct list_head *fmr_list); /** * ib_dealloc_fmr - Deallocates a fast memory region. * @fmr: The fast memory region to deallocate. */ int ib_dealloc_fmr(struct ib_fmr *fmr); /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type * IB_QPT_UD. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to send and receive multicast packets, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_alloc_xrcd - Allocates an XRC domain. * @device: The device on which to allocate the XRC domain. */ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); /** * ib_dealloc_xrcd - Deallocates an XRC domain. * @xrcd: The XRC domain to deallocate. 
*/ int ib_dealloc_xrcd(struct ib_xrcd *xrcd); struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int ib_destroy_flow(struct ib_flow *flow_id); -struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr, - struct ib_udata *udata); -int ib_destroy_dct(struct ib_dct *dct); -int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr); - -int ib_query_values(struct ib_device *device, - int q_values, struct ib_device_values *values); - -static inline void ib_active_speed_enum_to_rate(u8 active_speed, - int *rate, - char **speed) -{ - switch (active_speed) { - case IB_SPEED_DDR: - *speed = " DDR"; - *rate = 50; - break; - case IB_SPEED_QDR: - *speed = " QDR"; - *rate = 100; - break; - case IB_SPEED_FDR10: - *speed = " FDR10"; - *rate = 100; - break; - case IB_SPEED_FDR: - *speed = " FDR"; - *rate = 140; - break; - case IB_SPEED_EDR: - *speed = " EDR"; - *rate = 250; - break; - case IB_SPEED_SDR: - default: /* default to SDR for invalid rates */ - *rate = 25; - break; - } - -} - static inline int ib_check_mr_access(int flags) { /* * Local write permission is required if remote write or * remote atomic permission is also requested. */ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && !(flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; return 0; } /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected * ib_mr. first use is for signature status check. * * @mr: A memory region. * @check_mask: Bitmask of which checks to perform from * ib_mr_status_check enumeration. * @mr_status: The container of relevant status checks. * failed checks will be indicated in the status bitmask * and the relevant info shall be in the error item. 
*/ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, + u16 pkey, const union ib_gid *gid, + const struct sockaddr *addr); +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr); +int ib_destroy_wq(struct ib_wq *wq); +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr, + u32 wq_attr_mask); +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr* + wq_ind_table_init_attr); +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); + +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size); + +static inline int +ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) +{ + int n; + + n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); + mr->iova = 0; + + return n; +} + +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); + +void ib_drain_rq(struct ib_qp *qp); +void ib_drain_sq(struct ib_qp *qp); +void ib_drain_qp(struct ib_qp *qp); #endif /* IB_VERBS_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/iw_cm.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/iw_cm.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/iw_cm.h (revision 319974) @@ -1,257 +1,255 @@ /* * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef IW_CM_H #define IW_CM_H #include #include struct iw_cm_id; enum iw_cm_event_type { IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */ IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */ IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */ IW_CM_EVENT_DISCONNECT, /* orderly shutdown */ IW_CM_EVENT_CLOSE /* close complete */ }; struct iw_cm_event { enum iw_cm_event_type event; int status; - struct sockaddr_in local_addr; - struct sockaddr_in remote_addr; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; void *private_data; void *provider_data; u8 private_data_len; - struct socket *so; u8 ord; u8 ird; }; /** * iw_cm_handler - Function to be called by the IW CM when delivering events * to the client. 
* * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. */ typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id, struct iw_cm_event *event); /** * iw_event_handler - Function called by the provider when delivering provider * events to the IW CM. Returns either 0 indicating the event was processed * or -errno if the event could not be processed. * * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. */ typedef int (*iw_event_handler)(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id { iw_cm_handler cm_handler; /* client callback function */ void *context; /* client cb context */ struct ib_device *device; - struct sockaddr_in local_addr; - struct sockaddr_in remote_addr; + struct sockaddr_storage local_addr; /* local addr */ + struct sockaddr_storage remote_addr; + struct sockaddr_storage m_local_addr; /* nmapped local addr */ + struct sockaddr_storage m_remote_addr; /* nmapped rem addr */ void *provider_data; /* provider private data */ iw_event_handler event_handler; /* cb for provider events */ /* Used by provider to add and remove refs on IW cm_id */ void (*add_ref)(struct iw_cm_id *); void (*rem_ref)(struct iw_cm_id *); - struct socket *so; + u8 tos; }; struct iw_cm_conn_param { const void *private_data; u16 private_data_len; u32 ord; u32 ird; u32 qpn; }; struct iw_cm_verbs { void (*add_ref)(struct ib_qp *qp); void (*rem_ref)(struct ib_qp *qp); struct ib_qp * (*get_qp)(struct ib_device *device, int qpn); int (*connect)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*accept)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); - int (*create_listen_ep)(struct iw_cm_id *cm_id, + int (*create_listen)(struct iw_cm_id *cm_id, int backlog); - void (*destroy_listen_ep)(struct iw_cm_id *cm_id); - - void (*newconn)(struct iw_cm_id *parent_cm_id, - struct socket 
*so); + int (*destroy_listen)(struct iw_cm_id *cm_id); + char ifname[IFNAMSIZ]; }; /** * iw_create_cm_id - Create an IW CM identifier. * * @device: The IB device on which to create the IW CM identifier. * @cm_handler: User callback invoked to report events associated with the * returned IW CM identifier. * @context: User specified context associated with the id. */ -struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so, +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, iw_cm_handler cm_handler, void *context); /** * iw_destroy_cm_id - Destroy an IW CM identifier. * * @cm_id: The previously created IW CM identifier to destroy. * * The client can assume that no events will be delivered for the CM ID after * this function returns. */ void iw_destroy_cm_id(struct iw_cm_id *cm_id); /** * iw_cm_unbind_qp - Unbind the specified IW CM identifier and QP * * @cm_id: The IW CM identifier to unbind from the QP. * @qp: The QP * * This is called by the provider when destroying the QP to ensure * that any references held by the IWCM are released. It may also * be called by the IWCM when destroying a CM_ID so that any * references held by the provider are released. */ void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp); /** * iw_cm_get_qp - Return the ib_qp associated with a QPN * * @device: The IB device * @qpn: The queue pair number */ struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn); /** * iw_cm_listen - Listen for incoming connection requests on the * specified IW CM id. * * @cm_id: The IW CM identifier. * @backlog: The maximum number of outstanding un-accepted inbound listen * requests to queue. * * The source address and port number are specified in the IW CM identifier * structure. */ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog); /** * iw_cm_accept - Called to accept an incoming connect request. * * @cm_id: The IW CM identifier associated with the connection request.
* @iw_param: Pointer to a structure containing connection establishment * parameters. * * The specified cm_id will have been provided in the event data for a * CONNECT_REQUEST event. Subsequent events related to this connection will be * delivered to the specified IW CM identifier and may occur prior to * the return of this function. If this function returns a non-zero value, the * client can assume that no events will be delivered to the specified IW CM * identifier. */ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); /** * iw_cm_reject - Reject an incoming connection request. * * @cm_id: Connection identifier associated with the request. * @private_data: Pointer to data to deliver to the remote peer as part of the * reject message. * @private_data_len: The number of bytes in the private_data parameter. * * The client can assume that no events will be delivered to the specified IW * CM identifier following the return of this function. The private_data * buffer is available for reuse when this function returns. */ int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, u8 private_data_len); /** * iw_cm_connect - Called to request a connection to a remote peer. * * @cm_id: The IW CM identifier for the connection. * @iw_param: Pointer to a structure containing connection establishment * parameters. * * Events may be delivered to the specified IW CM identifier prior to the * return of this function. If this function returns a non-zero value, the * client can assume that no events will be delivered to the specified IW CM * identifier. */ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); /** * iw_cm_disconnect - Close the specified connection. * * @cm_id: The IW CM identifier to close. * @abrupt: If 0, the connection will be closed gracefully, otherwise, the * connection will be reset. * * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is * delivered.
*/ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt); /** * iw_cm_init_qp_attr - Called to initialize the attributes of the QP * associated with a IW CM identifier. * * @cm_id: The IW CM identifier associated with the QP * @qp_attr: Pointer to the QP attributes structure. * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are * valid. */ int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask); #endif /* IW_CM_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/iw_portmap.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/iw_portmap.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/iw_portmap.h (revision 319974) @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IW_PORTMAP_H +#define _IW_PORTMAP_H + +#define IWPM_ULIBNAME_SIZE 32 +#define IWPM_DEVNAME_SIZE 32 +#define IWPM_IFNAME_SIZE 16 +#define IWPM_IPADDR_SIZE 16 + +enum { + IWPM_INVALID_NLMSG_ERR = 10, + IWPM_CREATE_MAPPING_ERR, + IWPM_DUPLICATE_MAPPING_ERR, + IWPM_UNKNOWN_MAPPING_ERR, + IWPM_CLIENT_DEV_INFO_ERR, + IWPM_USER_LIB_INFO_ERR, + IWPM_REMOTE_QUERY_REJECT +}; + +struct iwpm_dev_data { + char dev_name[IWPM_DEVNAME_SIZE]; + char if_name[IWPM_IFNAME_SIZE]; +}; + +struct iwpm_sa_data { + struct sockaddr_storage loc_addr; + struct sockaddr_storage mapped_loc_addr; + struct sockaddr_storage rem_addr; + struct sockaddr_storage mapped_rem_addr; +}; + +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ +int iwpm_valid_pid(void); + +#endif /* _IW_PORTMAP_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/iw_portmap.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_port_info.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_port_info.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_port_info.h (revision 319974) @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(OPA_PORT_INFO_H) +#define OPA_PORT_INFO_H + +#define OPA_PORT_LINK_MODE_NOP 0 /* No change */ +#define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */ + +#define OPA_PORT_PACKET_FORMAT_NOP 0 /* No change */ +#define OPA_PORT_PACKET_FORMAT_8B 1 /* Format 8B */ +#define OPA_PORT_PACKET_FORMAT_9B 2 /* Format 9B */ +#define OPA_PORT_PACKET_FORMAT_10B 4 /* Format 10B */ +#define OPA_PORT_PACKET_FORMAT_16B 8 /* Format 16B */ + +#define OPA_PORT_LTP_CRC_MODE_NONE 0 /* No change */ +#define OPA_PORT_LTP_CRC_MODE_14 1 /* 14-bit LTP CRC mode (optional) */ +#define OPA_PORT_LTP_CRC_MODE_16 2 /* 16-bit LTP CRC mode */ +#define OPA_PORT_LTP_CRC_MODE_48 4 /* 48-bit LTP CRC mode (optional) */ +#define OPA_PORT_LTP_CRC_MODE_PER_LANE 8 /* 12/16-bit per lane LTP CRC mode */ + +/* Link Down / Neighbor Link Down Reason; indicated as follows: */ +#define OPA_LINKDOWN_REASON_NONE 0 /* No specified reason */ +#define OPA_LINKDOWN_REASON_RCV_ERROR_0 1 +#define OPA_LINKDOWN_REASON_BAD_PKT_LEN 2 +#define OPA_LINKDOWN_REASON_PKT_TOO_LONG 3 +#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT 4 +#define OPA_LINKDOWN_REASON_BAD_SLID 5 +#define OPA_LINKDOWN_REASON_BAD_DLID 6 +#define OPA_LINKDOWN_REASON_BAD_L2 7 +#define OPA_LINKDOWN_REASON_BAD_SC 8 +#define OPA_LINKDOWN_REASON_RCV_ERROR_8 9 +#define OPA_LINKDOWN_REASON_BAD_MID_TAIL 10 +#define OPA_LINKDOWN_REASON_RCV_ERROR_10 11 +#define OPA_LINKDOWN_REASON_PREEMPT_ERROR 12 +#define OPA_LINKDOWN_REASON_PREEMPT_VL15 13 +#define OPA_LINKDOWN_REASON_BAD_VL_MARKER 14 +#define OPA_LINKDOWN_REASON_RCV_ERROR_14 15 +#define OPA_LINKDOWN_REASON_RCV_ERROR_15 16 +#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST 17 +#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST 18 +#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST 19 +#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK 20 +#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER 21 +#define OPA_LINKDOWN_REASON_BAD_PREEMPT 22 +#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT 23 +#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT 24 
+#define OPA_LINKDOWN_REASON_RCV_ERROR_24 25 +#define OPA_LINKDOWN_REASON_RCV_ERROR_25 26 +#define OPA_LINKDOWN_REASON_RCV_ERROR_26 27 +#define OPA_LINKDOWN_REASON_RCV_ERROR_27 28 +#define OPA_LINKDOWN_REASON_RCV_ERROR_28 29 +#define OPA_LINKDOWN_REASON_RCV_ERROR_29 30 +#define OPA_LINKDOWN_REASON_RCV_ERROR_30 31 +#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN 32 +#define OPA_LINKDOWN_REASON_UNKNOWN 33 +/* 34 -reserved */ +#define OPA_LINKDOWN_REASON_REBOOT 35 +#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN 36 +/* 37-38 reserved */ +#define OPA_LINKDOWN_REASON_FM_BOUNCE 39 +#define OPA_LINKDOWN_REASON_SPEED_POLICY 40 +#define OPA_LINKDOWN_REASON_WIDTH_POLICY 41 +/* 42-48 reserved */ +#define OPA_LINKDOWN_REASON_DISCONNECTED 49 +#define OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED 50 +#define OPA_LINKDOWN_REASON_NOT_INSTALLED 51 +#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG 52 +/* 53 reserved */ +#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED 54 +/* 55 reserved */ +#define OPA_LINKDOWN_REASON_POWER_POLICY 56 +#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY 57 +#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY 58 +/* 59 reserved */ +#define OPA_LINKDOWN_REASON_SWITCH_MGMT 60 +#define OPA_LINKDOWN_REASON_SMA_DISABLED 61 +/* 62 reserved */ +#define OPA_LINKDOWN_REASON_TRANSIENT 63 +/* 64-255 reserved */ + +/* OPA Link Init reason; indicated as follows: */ +/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */ +#define OPA_LINKINIT_REASON_NOP 0 +#define OPA_LINKINIT_REASON_LINKUP (1 << 4) +#define OPA_LINKINIT_REASON_FLAPPING (2 << 4) +#define OPA_LINKINIT_REASON_CLEAR (8 << 4) +#define OPA_LINKINIT_OUTSIDE_POLICY (8 << 4) +#define OPA_LINKINIT_QUARANTINED (9 << 4) +#define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4) + +#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */ +#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */ +#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? 
Gbps (EDR) */ + +#define OPA_LINK_WIDTH_1X 0x0001 +#define OPA_LINK_WIDTH_2X 0x0002 +#define OPA_LINK_WIDTH_3X 0x0004 +#define OPA_LINK_WIDTH_4X 0x0008 + +#define OPA_CAP_MASK3_IsSnoopSupported (1 << 7) +#define OPA_CAP_MASK3_IsAsyncSC2VLSupported (1 << 6) +#define OPA_CAP_MASK3_IsAddrRangeConfigSupported (1 << 5) +#define OPA_CAP_MASK3_IsPassThroughSupported (1 << 4) +#define OPA_CAP_MASK3_IsSharedSpaceSupported (1 << 3) +/* reserved (1 << 2) */ +#define OPA_CAP_MASK3_IsVLMarkerSupported (1 << 1) +#define OPA_CAP_MASK3_IsVLrSupported (1 << 0) + +/** + * new MTU values + */ +enum { + OPA_MTU_8192 = 6, + OPA_MTU_10240 = 7, +}; + +enum { + OPA_PORT_PHYS_CONF_DISCONNECTED = 0, + OPA_PORT_PHYS_CONF_STANDARD = 1, + OPA_PORT_PHYS_CONF_FIXED = 2, + OPA_PORT_PHYS_CONF_VARIABLE = 3, + OPA_PORT_PHYS_CONF_SI_PHOTO = 4 +}; + +enum port_info_field_masks { + /* vl.cap */ + OPA_PI_MASK_VL_CAP = 0x1F, + /* port_states.ledenable_offlinereason */ + OPA_PI_MASK_OFFLINE_REASON = 0x0F, + OPA_PI_MASK_LED_ENABLE = 0x40, + /* port_states.unsleepstate_downdefstate */ + OPA_PI_MASK_UNSLEEP_STATE = 0xF0, + OPA_PI_MASK_DOWNDEF_STATE = 0x0F, + /* port_states.portphysstate_portstate */ + OPA_PI_MASK_PORT_PHYSICAL_STATE = 0xF0, + OPA_PI_MASK_PORT_STATE = 0x0F, + /* port_phys_conf */ + OPA_PI_MASK_PORT_PHYSICAL_CONF = 0x0F, + /* collectivemask_multicastmask */ + OPA_PI_MASK_COLLECT_MASK = 0x38, + OPA_PI_MASK_MULTICAST_MASK = 0x07, + /* mkeyprotect_lmc */ + OPA_PI_MASK_MKEY_PROT_BIT = 0xC0, + OPA_PI_MASK_LMC = 0x0F, + /* smsl */ + OPA_PI_MASK_SMSL = 0x1F, + /* partenforce_filterraw */ + /* Filter Raw In/Out bits 1 and 2 were removed */ + OPA_PI_MASK_LINKINIT_REASON = 0xF0, + OPA_PI_MASK_PARTITION_ENFORCE_IN = 0x08, + OPA_PI_MASK_PARTITION_ENFORCE_OUT = 0x04, + /* operational_vls */ + OPA_PI_MASK_OPERATIONAL_VL = 0x1F, + /* sa_qp */ + OPA_PI_MASK_SA_QP = 0x00FFFFFF, + /* sm_trap_qp */ + OPA_PI_MASK_SM_TRAP_QP = 0x00FFFFFF, + /* localphy_overrun_errors */ + OPA_PI_MASK_LOCAL_PHY_ERRORS = 0xF0, + 
OPA_PI_MASK_OVERRUN_ERRORS = 0x0F, + /* clientrereg_subnettimeout */ + OPA_PI_MASK_CLIENT_REREGISTER = 0x80, + OPA_PI_MASK_SUBNET_TIMEOUT = 0x1F, + /* port_link_mode */ + OPA_PI_MASK_PORT_LINK_SUPPORTED = (0x001F << 10), + OPA_PI_MASK_PORT_LINK_ENABLED = (0x001F << 5), + OPA_PI_MASK_PORT_LINK_ACTIVE = (0x001F << 0), + /* port_link_crc_mode */ + OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED = 0x0F00, + OPA_PI_MASK_PORT_LINK_CRC_ENABLED = 0x00F0, + OPA_PI_MASK_PORT_LINK_CRC_ACTIVE = 0x000F, + /* port_mode */ + OPA_PI_MASK_PORT_MODE_SECURITY_CHECK = 0x0001, + OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY = 0x0002, + OPA_PI_MASK_PORT_MODE_PKEY_CONVERT = 0x0004, + OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING = 0x0008, + OPA_PI_MASK_PORT_MODE_VL_MARKER = 0x0010, + OPA_PI_MASK_PORT_PASS_THROUGH = 0x0020, + OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE = 0x0040, + /* flit_control.interleave */ + OPA_PI_MASK_INTERLEAVE_DIST_SUP = (0x0003 << 12), + OPA_PI_MASK_INTERLEAVE_DIST_ENABLE = (0x0003 << 10), + OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX = (0x001F << 5), + OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX = (0x001F << 0), + + /* port_error_action */ + OPA_PI_MASK_EX_BUFFER_OVERRUN = 0x80000000, + /* 7 bits reserved */ + OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT = 0x00800000, + OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT = 0x00400000, + OPA_PI_MASK_FM_CFG_BAD_PREEMPT = 0x00200000, + OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER = 0x00100000, + OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK = 0x00080000, + OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST = 0x00040000, + OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST = 0x00020000, + OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST = 0x00010000, + /* 2 bits reserved */ + OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER = 0x00002000, + OPA_PI_MASK_PORT_RCV_PREEMPT_VL15 = 0x00001000, + OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR = 0x00000800, + /* 1 bit reserved */ + OPA_PI_MASK_PORT_RCV_BAD_MidTail = 0x00000200, + /* 1 bit reserved */ + OPA_PI_MASK_PORT_RCV_BAD_SC = 0x00000080, + OPA_PI_MASK_PORT_RCV_BAD_L2 = 0x00000040, + OPA_PI_MASK_PORT_RCV_BAD_DLID = 0x00000020, + 
OPA_PI_MASK_PORT_RCV_BAD_SLID = 0x00000010, + OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT = 0x00000008, + OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG = 0x00000004, + OPA_PI_MASK_PORT_RCV_BAD_PKTLEN = 0x00000002, + OPA_PI_MASK_PORT_RCV_BAD_LT = 0x00000001, + + /* pass_through.res_drctl */ + OPA_PI_MASK_PASS_THROUGH_DR_CONTROL = 0x01, + + /* buffer_units */ + OPA_PI_MASK_BUF_UNIT_VL15_INIT = (0x00000FFF << 11), + OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE = (0x0000001F << 6), + OPA_PI_MASK_BUF_UNIT_CREDIT_ACK = (0x00000003 << 3), + OPA_PI_MASK_BUF_UNIT_BUF_ALLOC = (0x00000003 << 0), + + /* neigh_mtu.pvlx_to_mtu */ + OPA_PI_MASK_NEIGH_MTU_PVL0 = 0xF0, + OPA_PI_MASK_NEIGH_MTU_PVL1 = 0x0F, + + /* neigh_mtu.vlstall_hoq_life */ + OPA_PI_MASK_VL_STALL = (0x03 << 5), + OPA_PI_MASK_HOQ_LIFE = (0x1F << 0), + + /* port_neigh_mode */ + OPA_PI_MASK_NEIGH_MGMT_ALLOWED = (0x01 << 3), + OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS = (0x01 << 2), + OPA_PI_MASK_NEIGH_NODE_TYPE = (0x03 << 0), + + /* resptime_value */ + OPA_PI_MASK_RESPONSE_TIME_VALUE = 0x1F, + + /* mtucap */ + OPA_PI_MASK_MTU_CAP = 0x0F, +}; + +struct opa_port_states { + u8 reserved; + u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */ + u8 reserved2; + u8 portphysstate_portstate; /* 4 bits, 4 bits */ +}; + +struct opa_port_state_info { + struct opa_port_states port_states; + __be16 link_width_downgrade_tx_active; + __be16 link_width_downgrade_rx_active; +}; + +struct opa_port_info { + __be32 lid; + __be32 flow_control_mask; + + struct { + u8 res; /* was inittype */ + u8 cap; /* 3 res, 5 bits */ + __be16 high_limit; + __be16 preempt_limit; + u8 arb_high_cap; + u8 arb_low_cap; + } vl; + + struct opa_port_states port_states; + u8 port_phys_conf; /* 4 res, 4 bits */ + u8 collectivemask_multicastmask; /* 2 res, 3, 3 */ + u8 mkeyprotect_lmc; /* 2 bits, 2 res, 4 bits */ + u8 smsl; /* 3 res, 5 bits */ + + u8 partenforce_filterraw; /* bit fields */ + u8 operational_vls; /* 3 res, 5 bits */ + __be16 pkey_8b; + __be16 pkey_10b; + __be16 mkey_violations; + 
+ __be16 pkey_violations; + __be16 qkey_violations; + __be32 sm_trap_qp; /* 8 bits, 24 bits */ + + __be32 sa_qp; /* 8 bits, 24 bits */ + u8 neigh_port_num; + u8 link_down_reason; + u8 neigh_link_down_reason; + u8 clientrereg_subnettimeout; /* 1 bit, 2 bits, 5 */ + + struct { + __be16 supported; + __be16 enabled; + __be16 active; + } link_speed; + struct { + __be16 supported; + __be16 enabled; + __be16 active; + } link_width; + struct { + __be16 supported; + __be16 enabled; + __be16 tx_active; + __be16 rx_active; + } link_width_downgrade; + __be16 port_link_mode; /* 1 res, 5 bits, 5 bits, 5 bits */ + __be16 port_ltp_crc_mode; /* 4 res, 4 bits, 4 bits, 4 bits */ + + __be16 port_mode; /* 9 res, bit fields */ + struct { + __be16 supported; + __be16 enabled; + } port_packet_format; + struct { + __be16 interleave; /* 2 res, 2,2,5,5 */ + struct { + __be16 min_initial; + __be16 min_tail; + u8 large_pkt_limit; + u8 small_pkt_limit; + u8 max_small_pkt_limit; + u8 preemption_limit; + } preemption; + } flit_control; + + __be32 reserved4; + __be32 port_error_action; /* bit field */ + + struct { + u8 egress_port; + u8 res_drctl; /* 7 res, 1 */ + } pass_through; + __be16 mkey_lease_period; + __be32 buffer_units; /* 9 res, 12, 5, 3, 3 */ + + __be32 reserved5; + __be32 sm_lid; + + __be64 mkey; + + __be64 subnet_prefix; + + struct { + u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */ + } neigh_mtu; + + struct { + u8 vlstall_hoqlife; /* 3 bits, 5 bits */ + } xmit_q[OPA_MAX_VLS]; + + struct { + u8 addr[16]; + } ipaddr_ipv6; + + struct { + u8 addr[4]; + } ipaddr_ipv4; + + u32 reserved6; + u32 reserved7; + u32 reserved8; + + __be64 neigh_node_guid; + + __be32 ib_cap_mask; + __be16 reserved9; /* was ib_cap_mask2 */ + __be16 opa_cap_mask; + + __be32 reserved10; /* was link_roundtrip_latency */ + __be16 overall_buffer_space; + __be16 reserved11; /* was max_credit_hint */ + + __be16 diag_code; + struct { + u8 buffer; + u8 wire; + } replay_depth; + u8 port_neigh_mode; + u8 mtucap; /* 4 
res, 4 bits */ + + u8 resptimevalue; /* 3 res, 5 bits */ + u8 local_port_num; + u8 reserved12; + u8 reserved13; /* was guid_cap */ +} __attribute__ ((packed)); + +#endif /* OPA_PORT_INFO_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_port_info.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_smi.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_smi.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_smi.h (revision 319974) @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(OPA_SMI_H) +#define OPA_SMI_H + +#include +#include + +#define OPA_SMP_LID_DATA_SIZE 2016 +#define OPA_SMP_DR_DATA_SIZE 1872 +#define OPA_SMP_MAX_PATH_HOPS 64 + +#define OPA_MAX_VLS 32 +#define OPA_MAX_SLS 32 +#define OPA_MAX_SCS 32 + +#define OPA_SMI_CLASS_VERSION 0x80 + +#define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF) + +struct opa_smp { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + u8 hop_ptr; + u8 hop_cnt; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + union { + struct { + uint8_t data[OPA_SMP_LID_DATA_SIZE]; + } lid; + struct { + __be32 dr_slid; + __be32 dr_dlid; + u8 initial_path[OPA_SMP_MAX_PATH_HOPS]; + u8 return_path[OPA_SMP_MAX_PATH_HOPS]; + u8 reserved[8]; + u8 data[OPA_SMP_DR_DATA_SIZE]; + } dr; + } route; +} __packed; + + +/* Subnet management attributes */ +/* ... */ +#define OPA_ATTRIB_ID_NODE_DESCRIPTION cpu_to_be16(0x0010) +#define OPA_ATTRIB_ID_NODE_INFO cpu_to_be16(0x0011) +#define OPA_ATTRIB_ID_PORT_INFO cpu_to_be16(0x0015) +#define OPA_ATTRIB_ID_PARTITION_TABLE cpu_to_be16(0x0016) +#define OPA_ATTRIB_ID_SL_TO_SC_MAP cpu_to_be16(0x0017) +#define OPA_ATTRIB_ID_VL_ARBITRATION cpu_to_be16(0x0018) +#define OPA_ATTRIB_ID_SM_INFO cpu_to_be16(0x0020) +#define OPA_ATTRIB_ID_CABLE_INFO cpu_to_be16(0x0032) +#define OPA_ATTRIB_ID_AGGREGATE cpu_to_be16(0x0080) +#define OPA_ATTRIB_ID_SC_TO_SL_MAP cpu_to_be16(0x0082) +#define OPA_ATTRIB_ID_SC_TO_VLR_MAP cpu_to_be16(0x0083) +#define OPA_ATTRIB_ID_SC_TO_VLT_MAP cpu_to_be16(0x0084) +#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP cpu_to_be16(0x0085) +/* ... */ +#define OPA_ATTRIB_ID_PORT_STATE_INFO cpu_to_be16(0x0087) +/* ... 
*/ +#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE cpu_to_be16(0x008A) +/* ... */ + +struct opa_node_description { + u8 data[64]; +} __attribute__ ((packed)); + +struct opa_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be32 reserved; + __be64 system_image_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; /* network byte order */ +} __attribute__ ((packed)); + +#define OPA_PARTITION_TABLE_BLK_SIZE 32 + +static inline u8 +opa_get_smp_direction(const struct opa_smp *smp) +{ + return ib_get_smp_direction((const struct ib_smp *)smp); +} + +static inline u8 *opa_get_smp_data(struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return smp->route.dr.data; + + return smp->route.lid.data; +} + +static inline size_t opa_get_smp_data_size(const struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return sizeof(smp->route.dr.data); + + return sizeof(smp->route.lid.data); +} + +static inline size_t opa_get_smp_header_size(const struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return sizeof(*smp) - sizeof(smp->route.dr.data); + + return sizeof(*smp) - sizeof(smp->route.lid.data); +} + +#endif /* OPA_SMI_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/opa_smi.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_cm.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_cm.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_cm.h (revision 
319974) @@ -1,407 +1,391 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if !defined(RDMA_CM_H) #define RDMA_CM_H #include #include #include #include /* * Upon receiving a device removal event, users must destroy the associated * RDMA identifier and release all resources allocated with the device. 
*/ enum rdma_cm_event_type { RDMA_CM_EVENT_ADDR_RESOLVED, RDMA_CM_EVENT_ADDR_ERROR, RDMA_CM_EVENT_ROUTE_RESOLVED, RDMA_CM_EVENT_ROUTE_ERROR, RDMA_CM_EVENT_CONNECT_REQUEST, RDMA_CM_EVENT_CONNECT_RESPONSE, RDMA_CM_EVENT_CONNECT_ERROR, RDMA_CM_EVENT_UNREACHABLE, RDMA_CM_EVENT_REJECTED, RDMA_CM_EVENT_ESTABLISHED, RDMA_CM_EVENT_DISCONNECTED, RDMA_CM_EVENT_DEVICE_REMOVAL, RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT, - RDMA_CM_EVENT_ALT_ROUTE_RESOLVED, - RDMA_CM_EVENT_ALT_ROUTE_ERROR, - RDMA_CM_EVENT_LOAD_ALT_PATH, - RDMA_CM_EVENT_ALT_PATH_LOADED, + RDMA_CM_EVENT_TIMEWAIT_EXIT }; +const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); + enum rdma_port_space { RDMA_PS_SDP = 0x0001, RDMA_PS_IPOIB = 0x0002, RDMA_PS_IB = 0x013F, RDMA_PS_TCP = 0x0106, RDMA_PS_UDP = 0x0111, }; -enum alt_path_type { - RDMA_ALT_PATH_NONE, - RDMA_ALT_PATH_PORT, - RDMA_ALT_PATH_LID, - RDMA_ALT_PATH_BEST -}; +#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL +#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL +#define RDMA_IB_IP_PS_IB 0x00000000013F0000ULL struct rdma_addr { struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr dev_addr; }; struct rdma_route { struct rdma_addr addr; struct ib_sa_path_rec *path_rec; int num_paths; }; struct rdma_conn_param { const void *private_data; u8 private_data_len; u8 responder_resources; u8 initiator_depth; u8 flow_control; u8 retry_count; /* ignored when accepting */ u8 rnr_retry_count; /* Fields below ignored if a QP is created on the rdma_cm_id. 
*/ u8 srq; u32 qp_num; + u32 qkey; }; struct rdma_ud_param { const void *private_data; u8 private_data_len; struct ib_ah_attr ah_attr; u32 qp_num; u32 qkey; - u8 alt_path_index; }; struct rdma_cm_event { enum rdma_cm_event_type event; int status; union { struct rdma_conn_param conn; struct rdma_ud_param ud; } param; }; enum rdma_cm_state { RDMA_CM_IDLE, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN, RDMA_CM_DEVICE_REMOVAL, RDMA_CM_DESTROYING }; struct rdma_cm_id; /** * rdma_cm_event_handler - Callback used to report user events. * * Notes: Users may not call rdma_destroy_id from this callback to destroy * the passed in id, or a corresponding listen id. Returning a * non-zero value from the callback will destroy the passed in id. */ typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id, struct rdma_cm_event *event); struct rdma_cm_id { struct ib_device *device; void *context; struct ib_qp *qp; rdma_cm_event_handler event_handler; struct rdma_route route; enum rdma_port_space ps; enum ib_qp_type qp_type; u8 port_num; - void *ucontext; }; /** * rdma_create_id - Create an RDMA identifier. * + * @net: The network namespace in which to create the new id. * @event_handler: User callback invoked to report events associated with the * returned rdma_id. * @context: User specified context associated with the id. * @ps: RDMA port space. * @qp_type: type of queue pair associated with the id. + * + * The id holds a reference on the network namespace until it is destroyed. */ -struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, +struct rdma_cm_id *rdma_create_id(struct vnet *net, + rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type); /** * rdma_destroy_id - Destroys an RDMA identifier. * * @id: RDMA identifier. 
* * Note: calling this function has the effect of canceling in-flight * asynchronous operations associated with the id. */ void rdma_destroy_id(struct rdma_cm_id *id); /** * rdma_bind_addr - Bind an RDMA identifier to a source address and * associated RDMA device, if needed. * * @id: RDMA identifier. * @addr: Local address information. Wildcard values are permitted. * * This associates a source address with the RDMA identifier before calling * rdma_listen. If a specific local address is given, the RDMA identifier will * be bound to a local RDMA device. */ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); /** * rdma_resolve_addr - Resolve destination and optional source addresses * from IP addresses to an RDMA address. If successful, the specified * rdma_cm_id will be bound to a local device. * * @id: RDMA identifier. * @src_addr: Source address information. This parameter may be NULL. * @dst_addr: Destination address information. * @timeout_ms: Time to wait for resolution to complete. */ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms); /** * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier * into route information needed to establish a connection. * * This is called on the client side of a connection. * Users must have first called rdma_resolve_addr to resolve a dst_addr * into an RDMA address before calling this routine. */ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); /** - * rdma_enable_apm - Get ready to use APM for the given ID. - * Actual Alternate path discovery and load will take place only - * after a connection has been established. - * - * Calling this function only has an effect on the connection's client side. - * It should be called after rdma_resolve_route and before rdma_connect. - * - * @id: RDMA identifier. - * @alt_type: Alternate path type to resolve. 
- */ -int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type); - -/** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. * * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA * through their states. */ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr); /** * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA * identifier. * * Users must destroy any QP associated with an RDMA identifier before * destroying the RDMA ID. */ void rdma_destroy_qp(struct rdma_cm_id *id); /** * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning * to a specified QP state. * @id: Communication identifier associated with the QP attributes to * initialize. * @qp_attr: On input, specifies the desired QP state. On output, the * mandatory and desired optional attributes will be set in order to * modify the QP to the specified state. * @qp_attr_mask: The QP attribute mask that may be used to transition the * QP to the specified state. * * Users must set the @qp_attr->qp_state to the desired QP state. This call * will set all required attributes for the given transition, along with * known optional attributes. Users may override the attributes returned from * this call before calling ib_modify_qp. * * Users that wish to have their QP automatically transitioned through its * states can associate a QP with the rdma_cm_id by calling rdma_create_qp(). */ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with * by having called rdma_resolve_route before calling this routine. 
* * This call will either connect to a remote QP or obtain remote QP * information for unconnected rdma_cm_id's. The actual operation is * based on the rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); /** * rdma_listen - This function is called by the passive side to * listen for incoming connection requests. * * Users must have bound the rdma_cm_id to a local address by calling * rdma_bind_addr before calling this routine. */ int rdma_listen(struct rdma_cm_id *id, int backlog); /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. */ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); /** * rdma_notify - Notifies the RDMA CM of an asynchronous event that has * occurred on the connection. * @id: Connection identifier to transition to established. * @event: Asynchronous event. * * This routine should be invoked by users to notify the CM of relevant * communication events. Events that should be reported to the CM and * when to report them are: * * IB_EVENT_COMM_EST - Used when a message is received on a connected * QP before an RTU has been received. */ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event); /** * rdma_reject - Called to reject a connection request or response. 
*/ int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len); /** * rdma_disconnect - This function disconnects the associated QP and * transitions it into the error state. */ int rdma_disconnect(struct rdma_cm_id *id); /** * rdma_join_multicast - Join the multicast group specified by the given * address. * @id: Communication identifier associated with the request. * @addr: Multicast address identifying the group to join. + * @join_state: Multicast JoinState bitmap requested by port. + * Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits. * @context: User-defined context associated with the join request, returned * to the user through the private_data pointer in multicast events. */ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context); + u8 join_state, void *context); /** * rdma_leave_multicast - Leave the multicast group specified by the given * address. */ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); /** * rdma_set_service_type - Set the type of service associated with a * connection identifier. * @id: Communication identifier to associated with service type. * @tos: Type of service. * * The type of service is interpretted as a differentiated service * field (RFC 2474). The service type should be specified before * performing route resolution, as existing communication on the * connection identifier may be unaffected. The type of service * requested may not be supported by the network to all destinations. */ void rdma_set_service_type(struct rdma_cm_id *id, int tos); /** * rdma_set_reuseaddr - Allow the reuse of local addresses when binding * the rdma_cm_id. * @id: Communication identifier to configure. * @reuse: Value indicating if the bound address is reusable. * * Reuse must be set before an address is bound to the id. 
*/ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); /** * rdma_set_afonly - Specify that listens are restricted to the * bound address family only. * @id: Communication identifer to configure. * @afonly: Value indicating if listens are restricted. * * Must be set before identifier is in the listening state. */ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); -/** - * rdma_set_timeout - Set the QP timeout associated with a connection - * identifier. - * @id: Communication identifier to associated with service type. - * @timeout: QP timeout + /** + * rdma_get_service_id - Return the IB service ID for a specified address. + * @id: Communication identifier associated with the address. + * @addr: Address for the service ID. */ -void rdma_set_timeout(struct rdma_cm_id *id, int timeout); -int rdma_cma_any_addr(struct sockaddr *addr); -int rdma_find_cmid_laddr(struct sockaddr_in *local_addr, - unsigned short dev_type, void **cm_id); +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr); + #endif /* RDMA_CM_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_user_cm.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_user_cm.h (revision 319973) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_user_cm.h (revision 319974) @@ -1,251 +1,310 @@ /* * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #ifndef RDMA_USER_CM_H #define RDMA_USER_CM_H #include +#include #include #include #include #define RDMA_USER_CM_ABI_VERSION 4 #define RDMA_MAX_PRIVATE_DATA 256 enum { RDMA_USER_CM_CMD_CREATE_ID, RDMA_USER_CM_CMD_DESTROY_ID, - RDMA_USER_CM_CMD_BIND_ADDR, - RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_BIND_IP, + RDMA_USER_CM_CMD_RESOLVE_IP, RDMA_USER_CM_CMD_RESOLVE_ROUTE, RDMA_USER_CM_CMD_QUERY_ROUTE, RDMA_USER_CM_CMD_CONNECT, RDMA_USER_CM_CMD_LISTEN, RDMA_USER_CM_CMD_ACCEPT, RDMA_USER_CM_CMD_REJECT, RDMA_USER_CM_CMD_DISCONNECT, RDMA_USER_CM_CMD_INIT_QP_ATTR, RDMA_USER_CM_CMD_GET_EVENT, RDMA_USER_CM_CMD_GET_OPTION, RDMA_USER_CM_CMD_SET_OPTION, RDMA_USER_CM_CMD_NOTIFY, - RDMA_USER_CM_CMD_JOIN_MCAST, + RDMA_USER_CM_CMD_JOIN_IP_MCAST, RDMA_USER_CM_CMD_LEAVE_MCAST, - RDMA_USER_CM_CMD_MIGRATE_ID + RDMA_USER_CM_CMD_MIGRATE_ID, + RDMA_USER_CM_CMD_QUERY, + RDMA_USER_CM_CMD_BIND, + RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_JOIN_MCAST }; /* * command ABI structures. */ struct rdma_ucm_cmd_hdr { __u32 cmd; __u16 in; __u16 out; }; struct rdma_ucm_create_id { __u64 uid; __u64 response; __u16 ps; __u8 qp_type; __u8 reserved[5]; }; struct rdma_ucm_create_id_resp { __u32 id; }; struct rdma_ucm_destroy_id { __u64 response; __u32 id; __u32 reserved; }; struct rdma_ucm_destroy_id_resp { __u32 events_reported; }; -struct rdma_ucm_bind_addr { +struct rdma_ucm_bind_ip { __u64 response; struct sockaddr_in6 addr; __u32 id; }; -struct rdma_ucm_resolve_addr { +struct rdma_ucm_bind { + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct rdma_ucm_resolve_ip { struct sockaddr_in6 src_addr; struct sockaddr_in6 dst_addr; __u32 id; __u32 timeout_ms; }; +struct rdma_ucm_resolve_addr { + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + struct rdma_ucm_resolve_route { __u32 id; __u32 timeout_ms; }; -struct rdma_ucm_query_route { 
+enum { + RDMA_USER_CM_QUERY_ADDR, + RDMA_USER_CM_QUERY_PATH, + RDMA_USER_CM_QUERY_GID +}; + +struct rdma_ucm_query { __u64 response; __u32 id; - __u32 reserved; + __u32 option; }; struct rdma_ucm_query_route_resp { __u64 node_guid; struct ib_user_path_rec ib_route[2]; struct sockaddr_in6 src_addr; struct sockaddr_in6 dst_addr; __u32 num_paths; __u8 port_num; __u8 reserved[3]; }; +struct rdma_ucm_query_addr_resp { + __u64 node_guid; + __u8 port_num; + __u8 reserved; + __u16 pkey; + __u16 src_size; + __u16 dst_size; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct rdma_ucm_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ib_path_rec_data path_data[0]; +}; + struct rdma_ucm_conn_param { __u32 qp_num; - __u32 reserved; + __u32 qkey; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; __u8 private_data_len; __u8 srq; __u8 responder_resources; __u8 initiator_depth; __u8 flow_control; __u8 retry_count; __u8 rnr_retry_count; __u8 valid; }; struct rdma_ucm_ud_param { __u32 qp_num; __u32 qkey; struct ib_uverbs_ah_attr ah_attr; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; __u8 private_data_len; __u8 reserved[7]; }; struct rdma_ucm_connect { struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; }; struct rdma_ucm_listen { __u32 id; __u32 backlog; }; struct rdma_ucm_accept { __u64 uid; struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; }; struct rdma_ucm_reject { __u32 id; __u8 private_data_len; __u8 reserved[3]; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; }; struct rdma_ucm_disconnect { __u32 id; }; struct rdma_ucm_init_qp_attr { __u64 response; __u32 id; __u32 qp_state; }; struct rdma_ucm_notify { __u32 id; __u32 event; }; -struct rdma_ucm_join_mcast { +struct rdma_ucm_join_ip_mcast { __u64 response; /* rdma_ucm_create_id_resp */ __u64 uid; struct sockaddr_in6 addr; __u32 id; }; +/* Multicast join flags */ +enum { + RDMA_MC_JOIN_FLAG_FULLMEMBER, + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER, + 
RDMA_MC_JOIN_FLAG_RESERVED, +}; + +struct rdma_ucm_join_mcast { + __u64 response; /* rdma_ucma_create_id_resp */ + __u64 uid; + __u32 id; + __u16 addr_size; + __u16 join_flags; + struct sockaddr_storage addr; +}; + struct rdma_ucm_get_event { __u64 response; }; struct rdma_ucm_event_resp { __u64 uid; __u32 id; __u32 event; __u32 status; union { struct rdma_ucm_conn_param conn; struct rdma_ucm_ud_param ud; } param; }; /* Option levels */ enum { RDMA_OPTION_ID = 0, RDMA_OPTION_IB = 1 }; /* Option details */ enum { - RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_ID_TOS = 0, RDMA_OPTION_ID_REUSEADDR = 1, RDMA_OPTION_ID_AFONLY = 2, - - RDMA_OPTION_IB_PATH = 1, - RDMA_OPTION_IB_APM = 2, + RDMA_OPTION_IB_PATH = 1 }; struct rdma_ucm_set_option { __u64 optval; __u32 id; __u32 level; __u32 optname; __u32 optlen; }; struct rdma_ucm_migrate_id { __u64 response; __u32 id; __u32 fd; }; struct rdma_ucm_migrate_resp { __u32 events_reported; }; #endif /* RDMA_USER_CM_H */ Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_vt.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_vt.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_vt.h (revision 319974) @@ -0,0 +1,500 @@ +#ifndef DEF_RDMA_VT_H +#define DEF_RDMA_VT_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Structure that low level drivers will populate in order to register with the + * rdmavt layer. 
+ */ + +#include +#include +#include +#include +#include +#include + +#define RVT_MAX_PKEY_VALUES 16 + +struct rvt_ibport { + struct rvt_qp __rcu *qp[2]; + struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ + struct rb_root mcast_tree; + spinlock_t lock; /* protect changes in this struct */ + + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + unsigned long trap_timeout; + __be64 gid_prefix; /* in network order */ + __be64 mkey; + u64 tid; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 mkey_lease_period; + u16 sm_lid; + u8 sm_sl; + u8 mkeyprot; + u8 subnet_timeout; + u8 vl_high_limit; + + /* + * Driver is expected to keep these up to date. These + * counters are informational only and not required to be + * completely accurate. + */ + u64 n_rc_resends; + u64 n_seq_naks; + u64 n_rdma_seq; + u64 n_rnr_naks; + u64 n_other_naks; + u64 n_loop_pkts; + u64 n_pkt_drops; + u64 n_vl15_dropped; + u64 n_rc_timeouts; + u64 n_dmawait; + u64 n_unaligned; + u64 n_rc_dupreq; + u64 n_rc_seqnak; + u16 pkey_violations; + u16 qkey_violations; + u16 mkey_violations; + + /* Hot-path per CPU counters to avoid cacheline trading to update */ + u64 z_rc_acks; + u64 z_rc_qacks; + u64 z_rc_delayed_comp; + u64 __percpu *rc_acks; + u64 __percpu *rc_qacks; + u64 __percpu *rc_delayed_comp; + + void *priv; /* driver private data */ + + /* + * The pkey table is allocated and maintained by the driver. Drivers + * need to have access to this before registering with rdmavt. However + * rdmavt will need access to it so drivers need to provide this during + * the attach port API call.
+ */ + u16 *pkey_table; + + struct rvt_ah *sm_ah; +}; + +#define RVT_CQN_MAX 16 /* maximum length of cq name */ + +/* + * Things that are driver specific, module parameters in hfi1 and qib + */ +struct rvt_driver_params { + struct ib_device_attr props; + + /* + * Anything driver specific that is not covered by props + * For instance special module parameters. Goes here. + */ + unsigned int lkey_table_size; + unsigned int qp_table_size; + int qpn_start; + int qpn_inc; + int qpn_res_start; + int qpn_res_end; + int nports; + int npkeys; + char cq_name[RVT_CQN_MAX]; + int node; + int psn_mask; + int psn_shift; + int psn_modify_mask; + u32 core_cap_flags; + u32 max_mad_size; + u8 qos_shift; + u8 max_rdma_atomic; + u8 reserved_operations; +}; + +/* Protection domain */ +struct rvt_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address handle */ +struct rvt_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; + atomic_t refcount; + u8 vl; + u8 log_pmtu; +}; + +struct rvt_dev_info; +struct rvt_swqe; +struct rvt_driver_provided { + /* + * Which functions are required depends on which verbs rdmavt is + * providing and which verbs the driver is overriding. See + * check_support() for details. + */ + + /* Passed to ib core registration. Callback to create sysfs files */ + int (*port_callback)(struct ib_device *, u8, struct kobject *); + + /* + * Returns a string to represent the device that is being + * registered. This is primarily used for error and debug messages on + * the console. + */ + const char * (*get_card_name)(struct rvt_dev_info *rdi); + + /* + * Returns a pointer to the underlying hardware's PCI device. This is + * used to display information as to what hardware is being referenced + * in an output message + */ + struct pci_dev * (*get_pci_dev)(struct rvt_dev_info *rdi); + + /* + * Allocate a private queue pair data structure for driver specific + * information which is opaque to rdmavt.
Errors are returned via + * ERR_PTR(err). The driver is free to return NULL or a valid + * pointer. + */ + void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + gfp_t gfp); + + /* + * Free the driver's private qp structure. + */ + void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp); + + /* + * Inform the driver the particular qp in question has been reset so + * that it can clean up anything it needs to. + */ + void (*notify_qp_reset)(struct rvt_qp *qp); + + /* + * Give the driver a notice that there is send work to do. It is up to + * the driver to generally push the packets out, this just queues the + * work with the driver. There are two variants here. The no_lock + * version requires the s_lock not to be held. The other assumes the + * s_lock is held. + */ + void (*schedule_send)(struct rvt_qp *qp); + void (*schedule_send_no_lock)(struct rvt_qp *qp); + + /* + * Sometimes rdmavt needs to kick the driver's send progress. That is + * done by this call back. + */ + void (*do_send)(struct rvt_qp *qp); + + /* + * Get a path mtu from the driver based on qp attributes. + */ + int (*get_pmtu_from_attr)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); + + /* + * Notify driver that it needs to flush any outstanding IO requests that + * are waiting on a qp. + */ + void (*flush_qp_waiters)(struct rvt_qp *qp); + + /* + * Notify driver to stop its queue of sending packets. Nothing else + * should be posted to the queue pair after this has been called. + */ + void (*stop_send_queue)(struct rvt_qp *qp); + + /* + * Have the driver drain any in progress operations + */ + void (*quiesce_qp)(struct rvt_qp *qp); + + /* + * Inform the driver a qp has gone to error state. + */ + void (*notify_error_qp)(struct rvt_qp *qp); + + /* + * Get an MTU for a qp.
+ */ + u32 (*mtu_from_qp)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + u32 pmtu); + /* + * Convert an mtu to a path mtu + */ + int (*mtu_to_path_mtu)(u32 mtu); + + /* + * Get the guid of a port in big endian byte order + */ + int (*get_guid_be)(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid); + + /* + * Query driver for the state of the port. + */ + int (*query_port_state)(struct rvt_dev_info *rdi, u8 port_num, + struct ib_port_attr *props); + + /* + * Tell driver to shutdown a port + */ + int (*shut_down_port)(struct rvt_dev_info *rdi, u8 port_num); + + /* Tell driver to send a trap for changed port capabilities */ + void (*cap_mask_chg)(struct rvt_dev_info *rdi, u8 port_num); + + /* + * The following functions can be safely ignored completely. Any use of + * these is checked for NULL before blindly calling. Rdmavt should also + * be functional if drivers omit these. + */ + + /* Called to inform the driver that all qps should now be freed. */ + unsigned (*free_all_qps)(struct rvt_dev_info *rdi); + + /* Driver specific AH validation */ + int (*check_ah)(struct ib_device *, struct ib_ah_attr *); + + /* Inform the driver a new AH has been created */ + void (*notify_new_ah)(struct ib_device *, struct ib_ah_attr *, + struct rvt_ah *); + + /* Let the driver pick the next queue pair number*/ + int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port_num, gfp_t gfp); + + /* Determine if its safe or allowed to modify the qp */ + int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Driver specific QP modification/notification-of */ + void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Driver specific work request checking */ + int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + + /* Notify driver a mad agent has been created */ + void 
(*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + + /* Notify driver a mad agent has been removed */ + void (*notify_free_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + +}; + +struct rvt_dev_info { + struct ib_device ibdev; /* Keep this first. Nothing above here */ + + /* + * Prior to calling for registration the driver will be responsible for + * allocating space for this structure. + * + * The driver will also be responsible for filling in certain members of + * dparms.props. The driver needs to fill in dparms exactly as it would + * want values reported to a ULP. This will be returned to the caller + * in rdmavt's device. The driver should also therefore refrain from + * modifying this directly after registration with rdmavt. + */ + + /* Driver specific properties */ + struct rvt_driver_params dparms; + + /* post send table */ + const struct rvt_operation_params *post_parms; + + struct rvt_mregion __rcu *dma_mr; + struct rvt_lkey_table lkey_table; + + /* Driver specific helper functions */ + struct rvt_driver_provided driver_f; + + /* Internal use */ + int n_pds_allocated; + spinlock_t n_pds_lock; /* Protect pd allocated count */ + + int n_ahs_allocated; + spinlock_t n_ahs_lock; /* Protect ah allocated count */ + + u32 n_srqs_allocated; + spinlock_t n_srqs_lock; /* Protect srqs allocated count */ + + int flags; + struct rvt_ibport **ports; + + /* QP */ + struct rvt_qp_ibdev *qp_dev; + u32 n_qps_allocated; /* number of QPs allocated for device */ + u32 n_rc_qps; /* number of RC QPs allocated for device */ + u32 busy_jiffies; /* timeout scaling based on RC QP count */ + spinlock_t n_qps_lock; /* protect qps, rc qps and busy jiffy counts */ + + /* memory maps */ + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; /* protect mmap_offset */ + u32 mmap_offset; + spinlock_t pending_lock; /* protect pending mmap list */ + + /* CQ */ + struct kthread_worker *worker; /* per device cq worker */ + u32 n_cqs_allocated; /* number of CQs 
allocated for device */ + spinlock_t n_cqs_lock; /* protect count of in use cqs */ + + /* Multicast */ + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; + +}; + +static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct rvt_pd, ibpd); +} + +static inline struct rvt_ah *ibah_to_rvtah(struct ib_ah *ibah) +{ + return container_of(ibah, struct rvt_ah, ibah); +} + +static inline struct rvt_dev_info *ib_to_rvt(struct ib_device *ibdev) +{ + return container_of(ibdev, struct rvt_dev_info, ibdev); +} + +static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct rvt_srq, ibsrq); +} + +static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct rvt_qp, ibqp); +} + +static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) +{ + /* + * All ports have same number of pkeys. + */ + return rdi->dparms.npkeys; +} + +/* + * Return the max atomic suitable for determining + * the size of the ack ring buffer in a QP. + */ +static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + 1; +} + +/* + * Return the indexed PKEY from the port PKEY table. + */ +static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi, + int port_index, + unsigned index) +{ + if (index >= rvt_get_npkeys(rdi)) + return 0; + else + return rdi->ports[port_index]->pkey_table[index]; +} + +/** + * rvt_lookup_qpn - return the QP with the given QPN + * @rvp: the ibport + * @qpn: the QP number to look up + * + * The caller must hold the rcu_read_lock(), and keep the lock until + * the returned qp is no longer in use.
+ */ +/* TODO: Remove this and put in rdmavt/qp.h when no longer needed by drivers */ +static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, + struct rvt_ibport *rvp, + u32 qpn) __must_hold(RCU) +{ + struct rvt_qp *qp = NULL; + + if (unlikely(qpn <= 1)) { + qp = rcu_dereference(rvp->qp[qpn]); + } else { + u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits); + + for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) + if (qp->ibqp.qp_num == qpn) + break; + } + return qp; +} + +struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); +void rvt_dealloc_device(struct rvt_dev_info *rdi); +int rvt_register_device(struct rvt_dev_info *rvd); +void rvt_unregister_device(struct rvt_dev_info *rvd); +int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); +int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, + int port_index, u16 *pkey_table); +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access); +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); +int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc); +int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, + struct rvt_sge *isge, struct ib_sge *sge, int acc); +struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid); + +#endif /* DEF_RDMA_VT_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdma_vt.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_cq.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_cq.h (nonexistent) +++ 
projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_cq.h (revision 319974) @@ -0,0 +1,99 @@ +#ifndef DEF_RDMAVT_INCCQ_H +#define DEF_RDMAVT_INCCQ_H + +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct rvt_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. 
+ */ +struct rvt_cq { + struct ib_cq ibcq; + struct kthread_work comptask; + spinlock_t lock; /* protect changes in this struct */ + u8 notify; + u8 triggered; + struct rvt_dev_info *rdi; + struct rvt_cq_wc *queue; + struct rvt_mmap_info *ip; +}; + +static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct rvt_cq, ibcq); +} + +void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); + +#endif /* DEF_RDMAVT_INCCQH */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_cq.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_mr.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_mr.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_mr.h (revision 319974) @@ -0,0 +1,140 @@ +#ifndef DEF_RDMAVT_INCMR_H +#define DEF_RDMAVT_INCMR_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * For Memory Regions. This stuff should probably be moved into rdmavt/mr.h once + * drivers no longer need access to the MR directly. + */ + +/* + * A segment is a linear region of low physical memory. + * Used by the verbs layer. + */ +struct rvt_seg { + void *vaddr; + size_t length; +}; + +/* The number of rvt_segs that fit in a page. 
*/ +#define RVT_SEGSZ (PAGE_SIZE / sizeof(struct rvt_seg)) + +struct rvt_segarray { + struct rvt_seg segs[RVT_SEGSZ]; +}; + +struct rvt_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of rvt_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + u8 page_shift; /* 0 - non-uniform/non-power-of-2 sizes */ + u8 lkey_published; /* in global table */ + atomic_t lkey_invalid; /* true if current lkey is invalid */ + struct completion comp; /* complete when refcount goes to zero */ + atomic_t refcount; + struct rvt_segarray *map[0]; /* the segments */ +}; + +#define RVT_MAX_LKEY_TABLE_BITS 23 + +struct rvt_lkey_table { + spinlock_t lock; /* protect changes in this struct */ + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct rvt_mregion __rcu **table; +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. 
+ */ +struct rvt_sge { + struct rvt_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +struct rvt_sge_state { + struct rvt_sge *sg_list; /* next SGE to be used if any */ + struct rvt_sge sge; /* progress state for the current SGE */ + u32 total_len; + u8 num_sge; +}; + +static inline void rvt_put_mr(struct rvt_mregion *mr) +{ + if (unlikely(atomic_dec_and_test(&mr->refcount))) + complete(&mr->comp); +} + +static inline void rvt_get_mr(struct rvt_mregion *mr) +{ + atomic_inc(&mr->refcount); +} + +static inline void rvt_put_ss(struct rvt_sge_state *ss) +{ + while (ss->num_sge) { + rvt_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + +#endif /* DEF_RDMAVT_INCMR_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_mr.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_qp.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_qp.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_qp.h (revision 319974) @@ -0,0 +1,535 @@ +#ifndef DEF_RDMAVT_INCQP_H +#define DEF_RDMAVT_INCQP_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +/* + * Atomic bit definitions for r_aflags. + */ +#define RVT_R_WRID_VALID 0 +#define RVT_R_REWIND_SGE 1 + +/* + * Bit definitions for r_flags. + */ +#define RVT_R_REUSE_SGE 0x01 +#define RVT_R_RDMAR_SEQ 0x02 +#define RVT_R_RSP_NAK 0x04 +#define RVT_R_RSP_SEND 0x08 +#define RVT_R_COMM_EST 0x10 + +/* + * Bit definitions for s_flags. + * + * RVT_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled + * RVT_S_BUSY - send tasklet is processing the QP + * RVT_S_TIMER - the RC retry timer is active + * RVT_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics + * RVT_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * RVT_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete + * before processing the next SWQE + * RVT_S_WAIT_RNR - waiting for RNR timeout + * RVT_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA + * RVT_S_WAIT_PIO - waiting for a send buffer to be available + * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets + * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available + * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available + * RVT_S_WAIT_KMEM - waiting for kernel memory to be available + * RVT_S_WAIT_PSN - waiting for a packet 
to exit the send DMA queue + * RVT_S_WAIT_ACK - waiting for an ACK packet before sending more requests + * RVT_S_SEND_ONE - send one packet, request ACK, then wait for ACK + * RVT_S_ECN - a BECN was queued to the send engine + */ +#define RVT_S_SIGNAL_REQ_WR 0x0001 +#define RVT_S_BUSY 0x0002 +#define RVT_S_TIMER 0x0004 +#define RVT_S_RESP_PENDING 0x0008 +#define RVT_S_ACK_PENDING 0x0010 +#define RVT_S_WAIT_FENCE 0x0020 +#define RVT_S_WAIT_RDMAR 0x0040 +#define RVT_S_WAIT_RNR 0x0080 +#define RVT_S_WAIT_SSN_CREDIT 0x0100 +#define RVT_S_WAIT_DMA 0x0200 +#define RVT_S_WAIT_PIO 0x0400 +#define RVT_S_WAIT_PIO_DRAIN 0x0800 +#define RVT_S_WAIT_TX 0x1000 +#define RVT_S_WAIT_DMA_DESC 0x2000 +#define RVT_S_WAIT_KMEM 0x4000 +#define RVT_S_WAIT_PSN 0x8000 +#define RVT_S_WAIT_ACK 0x10000 +#define RVT_S_SEND_ONE 0x20000 +#define RVT_S_UNLIMITED_CREDIT 0x40000 +#define RVT_S_AHG_VALID 0x80000 +#define RVT_S_AHG_CLEAR 0x100000 +#define RVT_S_ECN 0x200000 + +/* + * Wait flags that would prevent any packet type from being sent. + */ +#define RVT_S_ANY_WAIT_IO \ + (RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \ + RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) + +/* + * Wait flags that would prevent send work requests from making progress. 
+ */ +#define RVT_S_ANY_WAIT_SEND (RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | \ + RVT_S_WAIT_RNR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA | \ + RVT_S_WAIT_PSN | RVT_S_WAIT_ACK) + +#define RVT_S_ANY_WAIT (RVT_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) + +/* Number of bits to pay attention to in the opcode for checking qp type */ +#define RVT_OPCODE_QP_MASK 0xE0 + +/* Flags for checking QP state (see ib_rvt_state_ops[]) */ +#define RVT_POST_SEND_OK 0x01 +#define RVT_POST_RECV_OK 0x02 +#define RVT_PROCESS_RECV_OK 0x04 +#define RVT_PROCESS_SEND_OK 0x08 +#define RVT_PROCESS_NEXT_SEND_OK 0x10 +#define RVT_FLUSH_SEND 0x20 +#define RVT_FLUSH_RECV 0x40 +#define RVT_PROCESS_OR_FLUSH_SEND \ + (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND) + +/* + * Internal send flags + */ +#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START +#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct rvt_swqe { + union { + struct ib_send_wr wr; /* don't use wr.sg_list */ + struct ib_ud_wr ud_wr; + struct ib_reg_wr reg_wr; + struct ib_rdma_wr rdma_wr; + struct ib_atomic_wr atomic_wr; + }; + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct rvt_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. 
+ * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use rvt_get_rwqe_ptr() instead. + */ +struct rvt_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct rvt_rwqe wq[0]; +}; + +struct rvt_rq { + struct rvt_rwq *wq; + u32 size; /* size of RWQE array */ + u8 max_sge; + /* protect changes in this struct */ + spinlock_t lock ____cacheline_aligned_in_smp; +}; + +/* + * This structure is used by rvt_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct rvt_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct rvt_ack_entry { + struct rvt_sge rdma_sge; + u64 atomic_data; + u32 psn; + u32 lpsn; + u8 opcode; + u8 sent; +}; + +#define RC_QP_SCALING_INTERVAL 5 + +#define RVT_OPERATION_PRIV 0x00000001 +#define RVT_OPERATION_ATOMIC 0x00000002 +#define RVT_OPERATION_ATOMIC_SGE 0x00000004 +#define RVT_OPERATION_LOCAL 0x00000008 +#define RVT_OPERATION_USE_RESERVE 0x00000010 + +#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) + +/** + * rvt_operation_params - op table entry + * @length - the length to copy into the swqe entry + * @qpt_support - a bit mask indicating QP type support + * @flags - RVT_OPERATION flags (see above) + * + * This supports table driven post send so that + * the driver can have differing and potentially + * different sets of operations. + * + **/ + +struct rvt_operation_params { + size_t length; + u32 qpt_support; + u32 flags; +}; + +/* + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. 
+ */ +struct rvt_qp { + struct ib_qp ibqp; + void *priv; /* Driver private data */ + /* read mostly fields above and below */ + struct ib_ah_attr remote_ah_attr; + struct ib_ah_attr alt_ah_attr; + struct rvt_qp __rcu *next; /* link list for QPN hash table */ + struct rvt_swqe *s_wq; /* send work queue */ + struct rvt_mmap_info *ip; + + unsigned long timeout_jiffies; /* computed from timeout */ + + enum ib_mtu path_mtu; + int srate_mbps; /* s_srate (below) converted to Mbit/s */ + pid_t pid; /* pid for user mode QPs */ + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_ahgpsn; /* set to the psn in the copy of the header */ + + u16 pmtu; /* decoded from path_mtu */ + u8 log_pmtu; /* shift for pmtu */ + u8 state; /* QP state */ + u8 allowed_ops; /* high order bits of allowed opcodes */ + u8 qp_access_flags; + u8 alt_timeout; /* Alternate path timeout for this QP */ + u8 timeout; /* Timeout for this QP */ + u8 s_srate; + u8 s_mig_state; + u8 port_num; + u8 s_pkey_index; /* PKEY index to use */ + u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_draining; + + /* start of read/write fields */ + atomic_t refcount ____cacheline_aligned_in_smp; + wait_queue_head_t wait; + + struct rvt_ack_entry *s_ack_queue; + struct rvt_sge_state s_rdma_read_sge; + + spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ + u32 r_psn; /* expected rcv packet sequence number */ + unsigned long r_aflags; + u64 r_wr_id; /* ID for current receive WQE */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ 
+ u32 r_msn; /* message sequence number */ + + u8 r_state; /* opcode of last packet received */ + u8 r_flags; + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + + struct list_head rspwait; /* link for waiting to respond */ + + struct rvt_sge_state r_sge; /* current receive data */ + struct rvt_rq r_rq; /* receive work queue */ + + /* post send line */ + spinlock_t s_hlock ____cacheline_aligned_in_smp; + u32 s_head; /* new entries added here */ + u32 s_next_psn; /* PSN for next request */ + u32 s_avail; /* number of entries avail */ + u32 s_ssn; /* SSN of tail entry */ + atomic_t s_reserved_used; /* reserved entries in use */ + + spinlock_t s_lock ____cacheline_aligned_in_smp; + u32 s_flags; + struct rvt_sge_state *s_cur_sge; + struct rvt_swqe *s_wqe; + struct rvt_sge_state s_sge; /* current send request data */ + struct rvt_mregion *s_rdma_mr; + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_sending_psn; /* lowest PSN that is being sent */ + u32 s_sending_hpsn; /* highest PSN that is being sent */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_acked; /* last un-ACK'ed entry */ + u32 s_last; /* last completed entry */ + u32 s_lsn; /* limit sequence number (credit) */ + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u16 s_rdma_ack_cnt; + s8 s_ahgidx; + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_num_rd_atomic; /* number of 
RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + + struct rvt_sge_state s_ack_rdma_sge; + struct timer_list s_timer; + + atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */ + + /* + * This sge list MUST be last. Do not add anything below here. + */ + struct rvt_sge r_sg_list[0] /* verified SGEs */ + ____cacheline_aligned_in_smp; +}; + +struct rvt_srq { + struct ib_srq ibsrq; + struct rvt_rq rq; + struct rvt_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +#define RVT_QPN_MAX BIT(24) +#define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1) +#define RVT_QPN_MASK 0xFFFFFF + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct rvt_qpn_map { + void *page; +}; + +struct rvt_qpn_table { + spinlock_t lock; /* protect changes to the qp table */ + unsigned flags; /* flags for QP0/1 allocated for each port */ + u32 last; /* last QP number allocated */ + u32 nmaps; /* size of the map table */ + u16 limit; + u8 incr; + /* bit map of free QP numbers other than 0/1 */ + struct rvt_qpn_map map[RVT_QPNMAP_ENTRIES]; +}; + +struct rvt_qp_ibdev { + u32 qp_table_size; + u32 qp_table_bits; + struct rvt_qp __rcu **qp_table; + spinlock_t qpt_lock; /* qptable lock */ + struct rvt_qpn_table qpn_table; +}; + +/* + * There is one struct rvt_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct rvt_mcast_qp. 
+ */ +struct rvt_mcast_qp { + struct list_head list; + struct rvt_qp *qp; +}; + +struct rvt_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* + * Since struct rvt_swqe is not a fixed size, we can't simply index into + * struct rvt_qp.s_wq. This function does the array index computation. + */ +static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp, + unsigned n) +{ + return (struct rvt_swqe *)((char *)qp->s_wq + + (sizeof(struct rvt_swqe) + + qp->s_max_sge * + sizeof(struct rvt_sge)) * n); +} + +/* + * Since struct rvt_rwqe is not a fixed size, we can't simply index into + * struct rvt_rwq.wq. This function does the array index computation. + */ +static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) +{ + return (struct rvt_rwqe *) + ((char *)rq->wq->wq + + (sizeof(struct rvt_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/** + * rvt_get_qp - get a QP reference + * @qp - the QP to hold + */ +static inline void rvt_get_qp(struct rvt_qp *qp) +{ + atomic_inc(&qp->refcount); +} + +/** + * rvt_put_qp - release a QP reference + * @qp - the QP to release + */ +static inline void rvt_put_qp(struct rvt_qp *qp) +{ + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * rvt_qp_wqe_reserve - reserve operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This routine used in post send to record + * a wqe relative reserved operation use. + */ +static inline void rvt_qp_wqe_reserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; + atomic_inc(&qp->s_reserved_used); +} + +/** + * rvt_qp_wqe_unreserve - clean reserved operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This decrements the reserve use count. + * + * This call MUST precede the change to + * s_last to insure that post send sees a stable + * s_avail. 
+ * + * An smp_mb__after_atomic() is used to insure + * the compiler does not juggle the order of the s_last + * ring index and the decrementing of s_reserved_used. + */ +static inline void rvt_qp_wqe_unreserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { + wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; + atomic_dec(&qp->s_reserved_used); + /* insure no compiler re-order up to s_last change */ + smp_mb__after_atomic(); + } +} + +extern const int ib_rvt_state_ops[]; + +struct rvt_dev_info; +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); + +#endif /* DEF_RDMAVT_INCQP_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/rdma/rdmavt_qp.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx4-abi.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx4-abi.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx4-abi.h (revision 319974) @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ABI_USER_H +#define MLX4_ABI_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
+ */ + +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_ib_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_ib_resize_cq { + __u64 buf_addr; +}; + +struct mlx4_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; +}; + +#endif /* MLX4_ABI_USER_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx4-abi.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx5-abi.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx5-abi.h (nonexistent) +++ projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx5-abi.h (revision 319974) @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_ABI_USER_H +#define MLX5_ABI_USER_H + +#include + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_WQ_FLAG_SIGNATURE = 1 << 0, +}; + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
+ */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 comp_mask; + __u8 max_cqe_version; + __u8 reserved0; + __u16 reserved1; + __u32 reserved2; +}; + +enum mlx5_ib_alloc_ucontext_resp_mask { + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, +}; + +enum mlx5_user_cmds_supp_uhw { + MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0, +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved1; + __u32 comp_mask; + __u32 response_length; + __u8 cqe_version; + __u8 cmds_supp_uhw; + __u16 reserved2; + __u64 hca_core_clock_offset; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_tso_caps { + __u32 max_tso; /* Maximum tso payload size in bytes */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. 
+ * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; +}; + +struct mlx5_ib_rss_caps { + __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 reserved[7]; +}; + +struct mlx5_ib_query_device_resp { + __u32 comp_mask; + __u32 response_length; + struct mlx5_ib_tso_caps tso_caps; + struct mlx5_ib_rss_caps rss_caps; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved0; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; + __u32 uidx; + __u32 reserved0; + __u64 sq_buf_addr; +}; + +/* RX Hash function flags */ +enum mlx5_rx_hash_function_flags { + MLX5_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags: these flags select which fields of an incoming packet + * participate in the RX Hash. Each flag represents one packet field; + * when a flag is set, the field it represents will + * participate in the RX Hash calculation. + * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP + * and *TCP and *UDP flags can't be enabled together on the same QP. 
+*/ +enum mlx5_rx_hash_fields { + MLX5_RX_HASH_SRC_IPV4 = 1 << 0, + MLX5_RX_HASH_DST_IPV4 = 1 << 1, + MLX5_RX_HASH_SRC_IPV6 = 1 << 2, + MLX5_RX_HASH_DST_IPV6 = 1 << 3, + MLX5_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX5_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX5_RX_HASH_DST_PORT_UDP = 1 << 7 +}; + +struct mlx5_ib_create_qp_rss { + __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 rx_key_len; /* valid only for Toeplitz */ + __u8 reserved[6]; + __u8 rx_hash_key[128]; /* valid only for Toeplitz */ + __u32 comp_mask; + __u32 reserved1; +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; +}; + +struct mlx5_ib_alloc_mw { + __u32 comp_mask; + __u8 num_klms; + __u8 reserved1; + __u16 reserved2; +}; + +struct mlx5_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 user_index; + __u32 flags; + __u32 comp_mask; + __u32 reserved; +}; + +struct mlx5_ib_create_wq_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; +#endif /* MLX5_ABI_USER_H */ Property changes on: projects/bsd_rdma_4_9/sys/ofed/include/uapi/rdma/mlx5-abi.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property